In [1]:
.libPaths( c( .libPaths(), "/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library") )
In [66]:
.libPaths()
  1. '/home/sirenkom/R/x86_64-pc-linux-gnu-library/3.6'
  2. '/juno/work/isabl/home/gaot/R/R-3.6.1/library'
  3. '/juno/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library'
In [3]:
library(dplyr, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(stringr, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(data.table, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(ggbeeswarm, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(glue, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(ggforce, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(pec, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(patchwork, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(survival, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(survminer, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(IRdisplay, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(cowplot, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
# library(igraph)
library(ggpubr, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library") # load before survminer
library(ggrepel, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(kableExtra, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
source('/work/isabl/home/gaot/facets-ch/R/facets-ch-utils.R')
source('/work/isabl/home/gaot/ch_cnv/ch-cnv/notebooks/plot_forest.R')
setwd('/work/isabl/home/gaot/ch_cnv')
Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


Loading required package: ggplot2

Warning message:
“package ‘ggplot2’ was built under R version 3.6.3”

Attaching package: ‘glue’


The following object is masked from ‘package:dplyr’:

    collapse


Loading required package: prodlim

Loading required package: ggpubr

Error: package or namespace load failed for ‘ggpubr’ in dyn.load(file, DLLpath = DLLpath, ...):
 unable to load shared object '/home/sirenkom/R/x86_64-pc-linux-gnu-library/3.6/curl/libs/curl.so':
  libR.so: cannot open shared object file: No such file or directory

Error: package ‘ggpubr’ could not be loaded
Traceback:

1. library(survminer, lib.loc = "/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
2. .getRequiredPackages2(pkgInfo, quietly = quietly)
3. stop(gettextf("package %s could not be loaded", sQuote(pkg)), 
 .     call. = FALSE, domain = NA)
In [4]:
library("ggpubr", lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
In [5]:
library(dplyr, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(stringr, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(data.table, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(ggbeeswarm, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(glue, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(ggforce, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(pec, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(patchwork, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(survival, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(survminer, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(IRdisplay, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(cowplot, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
# library(igraph)
library(ggpubr, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library") # load before survminer
library(ggrepel, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(kableExtra, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
source('/work/isabl/home/gaot/facets-ch/R/facets-ch-utils.R')
source('/work/isabl/home/gaot/ch_cnv/ch-cnv/notebooks/plot_forest.R')
setwd('/work/isabl/home/gaot/ch_cnv')
********************************************************

Note: As of version 1.0.0, cowplot does not change the

  default ggplot2 theme anymore. To recover the previous

  behavior, execute:
  theme_set(theme_cowplot())

********************************************************



Attaching package: ‘cowplot’


The following object is masked from ‘package:ggpubr’:

    get_legend


The following object is masked from ‘package:patchwork’:

    align_plots



Attaching package: ‘kableExtra’


The following object is masked from ‘package:dplyr’:

    group_rows


In [6]:
getwd()
'/juno/work/isabl/home/gaot/ch_cnv'

Preprocess

In [7]:
M_wide = fread('./data/sm_wide_jul13.tsv')
# segs = fread('./data/segs_aberrant_may6.tsv', sep = '\t')

segs = rbind(
    fread('./data/segs_aberrant_may18.tsv', sep = '\t'),
    fread('./data/segs_aberrant_nonparta.tsv', sep = '\t'),
    fill = TRUE
)

segs_filtered_old = fread('./data/segs_aberrant_apr19.tsv', sep = '\t')
gene_bed = fread('./data/impact_genes.tsv') %>% 
    mutate(chrom = ifelse(chrom == 'X', 23, chrom)) %>%
    mutate(chrom = ifelse(chrom == 'Y', 24, chrom)) %>%
    mutate(chrom = as.integer(chrom))
chrom_lens = fread('/home/gaot/ref/chrom_lens.tsv') %>% filter(chrom != 24)

acen = fread('/home/gaot/ref/acen.tsv')
colnames(acen) = c('chrom', 'start', 'end', 'arm', 'type')
chrom_lens = fread('/home/gaot/ref/chrom_lens.tsv') %>% filter(chrom != 24)
chrom_levels = c(as.character(1:22), 'X')

acen = acen %>% 
    mutate(chrom = str_remove(chrom, 'chr')) %>%
    filter(chrom != 'Y') %>%
    mutate(
        chrom = ifelse(chrom == 'X', 23, chrom),
        chrom = as.integer(chrom)
    ) %>%
    arrange(chrom) %>%
    group_by(chrom) %>%
    summarise(cen_start = min(start), cen_end = max(end)) %>%
    ungroup()

# mCA positive patient EMR text search
emr_review = fread('./data/clinicalreview/cnv_patient_asm_review.tsv') %>% filter(exclude == 1)

M_wide = M_wide %>% 
    mutate(
        DateofProcedure = as.Date(DateofProcedure, format = '%Y-%m-%d'),
        date_birth = as.Date(date_birth, format = '%Y-%m-%d'),
        date_death = as.Date(date_death, format = '%Y-%m-%d')
    ) %>%
    mutate(age = as.numeric((DateofProcedure - date_birth)/365)) %>%
    filter(!active_heme) %>%
    filter(QC %in% c('Passed', '#N/A')) %>%
    filter(age > 20) %>%
    filter(!dmp_patient_id %in% emr_review$dmp_patient_id)

# clean up heme diagnosis
M_wide = M_wide %>%
    mutate(heme_cat_billing = heme_cat) %>%
    mutate(heme_cat = ifelse(is.na(heme_cat_review), heme_cat, heme_cat_review)) %>%
    mutate(
        manual_review = as.integer(!is.na(valid_diagnosis)),
        valid_diagnosis = ifelse(is.na(valid_diagnosis), 0, valid_diagnosis)
    ) %>%
    mutate(
        heme_cat = ifelse(valid_diagnosis == 0 & manual_review, 'Invalid', heme_cat),
        post_heme = ifelse(valid_diagnosis == 1 & manual_review, secondary_heme, 0)
    ) %>%
    mutate(
        post_cll = as.integer(heme_cat == 'CLL' & post_heme),
        post_mpn = as.integer(heme_cat %in% c('PV', 'ET', 'MPN') & post_heme),
        post_tmn = as.integer(heme_cat %in% c('AML', 'MDS') & post_heme),
        post_cml = as.integer(heme_cat %in% c('CML') & post_heme)
    ) %>%
    mutate(
        post_anyblood = post_heme,
        post_mn = as.integer(post_mpn | post_tmn),
        post_leukemia = as.integer(post_cll | post_mpn | post_tmn | post_cml)
    ) %>%
    mutate(heme_cat = ifelse(heme_cat %in% c('ET', 'PV'), 'MPN', heme_cat))

# recode hemecat
M_wide = M_wide %>% 
    mutate(heme_cat2 = heme_cat) %>%
    mutate(
        heme_cat2 = ifelse(heme_cat2 == 'LCH', NA, heme_cat2),
        heme_cat2 = ifelse(heme_cat2 %in% c('ET', 'PV'), 'MPN', heme_cat2),
        heme_cat2 = ifelse(heme_cat2 %in% c('ALCL', 'CTCL'), 'T-NHL', heme_cat2),
        heme_cat2 = ifelse(heme_cat2 %in% c('DLBCL', 'FL', 'MZL', 'PCNSL', 'WM'), 'B-NHL', heme_cat2)
    ) %>%
    mutate(heme_cat2 = factor(heme_cat2, c('MDS', 'MPN', 'AML', 'CML', 'CLL', 'B-NHL', 'T-NHL', 'MM')))

M_wide = M_wide %>% mutate(post_heme = ifelse(is.na(heme_cat2), 0, post_heme))

display(paste0(sum(M_wide$post_tmn), 'tMN, ', sum(M_wide$post_cll), 'CLL, ', sum(M_wide$post_mpn), 'MPN'))

# clean up survival data
M_wide = M_wide %>%
    mutate(race_b = ifelse(Race == "WHITE",1,0)) %>%
    rowwise() %>%
    mutate(
        date_lastfu = as.Date(date_lastfu, format = '%Y-%m-%d'),
        # sometimes last contact date is not accurate
        time_lastfu = max(as.integer(date_lastfu - DateofProcedure), 0), 
        time_tmn = ifelse(post_tmn == 1, seq_to_diag, time_lastfu),
        time_mn = ifelse(post_mn == 1, seq_to_diag, time_lastfu),
        time_cll = ifelse(post_cll == 1, seq_to_diag, time_lastfu),
        time_anyblood = ifelse(post_anyblood == 1, seq_to_diag, time_lastfu),
        time_leukemia = ifelse(post_leukemia == 1, seq_to_diag, time_lastfu)
    ) %>%
    ungroup() %>%
    mutate(
        ch_sm = ch_all
    ) %>%
    mutate(tumor_type = GeneralTumorType)

# clean up smoke
M_wide = M_wide %>%
    mutate(smoke = ifelse(is.na(smoke_bin), 'missing', c('1' = 'yes', '0' = 'no')[as.character(smoke_bin)])) %>%
    mutate(smoke = factor(smoke, c('no', 'yes', 'missing')))

therapy_detailed = fread('./data/therapy_chemoxrt.tsv')
therapy_detailed = therapy_detailed %>% filter(complete.cases(.))

M_wide = M_wide %>% 
    left_join(
            therapy_detailed,
            by = "dmp_patient_id"
        ) %>%
        mutate(therapy_detailed = !is.na(XRT))

### merge with CNV calls
failed_samples = segs %>% filter(is.na(seg)) %>% pull(dmp_sample_id)

M_wide = M_wide %>% filter(!dmp_sample_id %in% failed_samples)

M_wide %>% count(therapy_detailed)

M_long = fread('./data/sm_long_jul13.tsv') %>%
    filter(dmp_patient_id %in% M_wide$dmp_patient_id)

M_long = M_long %>% 
    rename(chrom = Chrom, start = Start) %>%
    mutate(chrom = as.integer(ifelse(chrom == 'X', 23, chrom)))

M_long = M_long %>% distinct(dmp_sample_id, start, Ref, Alt, `.keep_all` = T)

M_long = M_long %>% mutate(
        VariantClass2 = case_when(
            VariantClass %in% c('Translation_Start_Site', "5'Flank", "3'Flank", "3'UTR", "5'UTR") ~ 'Regulatory',
            VariantClass %in% c('Missense_Mutation', 'In_Frame_Del', 'In_Frame_Ins') ~ 'Missense',
            VariantClass %in% c('Nonsense_Mutation', 'Nonstop_Mutation', 'Splice_Site', 'Splice_Region', 'Frame_Shift_Del', 'Frame_Shift_Ins') ~ 'Truncating',
            T ~ VariantClass)
    )

segs = segs %>% 
    filter(!is.na(seg)) %>%
    inner_join(M_wide) %>%
    mutate(active_heme = ifelse(is.na(active_heme), T, active_heme))

segs = segs %>% mutate(
    z_x = ifelse(is.na(z_x), t_cnlr, z_x),
    z_y = ifelse(is.na(z_y), t_valor, z_y),
    p_x = ifelse(is.na(p_x), p_cnlr, p_x),
    p_y = ifelse(is.na(p_y), p_valor, p_y)
)

segs_filtered = segs %>%
    ungroup() %>%
    filter((p_chisq < 1e-10 & p_y < 1e-3)) %>%
    filter(type != 'err' & err < 0.06) %>%
    # germline amplifications
    filter(!(phi >= 0.8 & type %in% c('amp'))) %>%
    filter(!(type == 'amp' & size < 24e6)) %>% 
    filter(cnlr <= 0.5) %>%
    filter(!(type == 'loh' & size < 8e6)) %>%
    filter(!(chrom == 23 & Gender == 'Male' & type == 'loh')) %>%
    filter(!(chrom == 23 & type == 'amp')) %>%
    # recurrent focal artifacts
    filter(!(chrom == 2 & type == 'del' & start > 11e7 & end < 13e7)) %>%
    filter(!(chrom == 11 & type == 'del' & start > 60e6 & end < 70e6)) %>%
    filter(!(chrom == 7 & start > 40e6 & end < 70e6))

# circulating tumor DNA
segs_filtered = segs_filtered %>% filter(dmp_patient_id != 'P-0027364')

segs_filtered = segs_filtered %>% 
    left_join(chrom_lens %>% rename(chrom_length = length), by = 'chrom') %>%
    mutate(prop_chrom = size / chrom_length) %>%
    mutate(focality = ifelse(prop_chrom < 0.15, 'focal', 'broad')) %>%
    mutate(cnv_label = paste0(chrom, c('amp' = '+', 'del' = '-', 'loh' = '=')[type])) %>%
    mutate(type2 = factor(c('del' = 'DEL', 'loh' = 'CNLOH', 'amp' = 'AMP')[type], c('DEL', 'CNLOH', 'AMP')))

# annodate locality
segs_filtered = segs_filtered %>% left_join(acen, by = "chrom")

segs_filtered = segs_filtered %>% 
    rowwise() %>%
    mutate(
        p_length = cen_start,
        q_length = chrom_length - cen_end,
        p_size = max(min(end, cen_start) - max(start, 0), 0),
        q_size = max(min(end, chrom_length) - max(start, cen_end), 0),
        p_frac = p_size/p_length,
        q_frac = q_size/q_length,
        arm = case_when(
            p_frac > 0.5 & q_frac > 0.5 ~ 'p,q',
            p_frac > q_frac ~ 'p',
            T ~ 'q'
        ),
        focality = ifelse(max(p_frac, q_frac) < 0.1, 'focal', 'broad')
    ) %>%
    ungroup()

n_samples = segs_filtered %>% group_by(dmp_patient_id, dmp_sample_id) %>% summarise(n_samples = length(unique(dmp_sample_id))) %>% pull(n_samples) %>% max

display(n_samples)

display(nrow(segs_filtered))

# gemrline calls
M_germline = fread('./data/germline_muts.tsv')
M_germline = M_germline %>% filter(dmp_patient_id %in% M_wide$dmp_patient_id)

M_germline_wide = M_germline %>% 
    mutate(ind_germline = 1) %>%
    reshape2::dcast(
        dmp_patient_id ~ Gene,
        value.var = 'ind_germline',
        fun.aggregate = max,
        fill = 0
    ) %>%
    setNames(c('dmp_patient_id', paste0(colnames(.)[-1], '_g'))) %>%
    mutate(any_germline = 1)

M_wide = M_wide %>% left_join(
        M_germline_wide,
        by = 'dmp_patient_id'
    ) %>%
    mutate_at(colnames(M_germline_wide), function(x){ifelse(is.na(x), 0, x)})

segs_filtered = segs_filtered %>% 
    left_join(M_germline_wide, by = "dmp_patient_id")

# merge with GM data
CNV = segs_filtered %>% 
    mutate(ch_cnv = 1) %>%
    group_by(dmp_patient_id, chrom) %>%
    summarise(ch_cnv = max(ch_cnv)) %>%
    arrange(chrom) %>%
    ungroup() %>%
    mutate(chrom = paste0('chr', chrom)) %>%
    mutate(chrom = factor(chrom, unique(chrom))) %>%
    reshape2::dcast(
        dmp_patient_id ~ chrom,
        value.var = 'ch_cnv',
        fill = 0
    ) %>%
    mutate(ch_cnv = 1)

CNV_detailed = segs_filtered %>% 
    mutate(ch_cnv = 1) %>%
    group_by(dmp_patient_id, chrom, type) %>%
    summarise(ch_cnv = max(ch_cnv)) %>%
    arrange(chrom) %>%
    ungroup() %>%
    mutate(chrom = paste0('chr', chrom)) %>%
    mutate(chrom = factor(chrom, unique(chrom))) %>%
    reshape2::dcast(
        dmp_patient_id ~ chrom + type,
        value.var = 'ch_cnv',
        fill = 0
    )

CNV_detailed_arm = segs_filtered %>% 
    mutate(ch_cnv = 1) %>%
    group_by(dmp_patient_id, chrom, type, arm) %>%
    summarise(ch_cnv = max(ch_cnv)) %>%
    arrange(chrom) %>%
    ungroup() %>%
    mutate(chrom = paste0('chr', chrom)) %>%
    mutate(chrom = factor(chrom, unique(chrom))) %>%
    mutate(arm = str_remove(arm, ',')) %>%
    reshape2::dcast(
        dmp_patient_id ~ chrom + arm + type,
        value.var = 'ch_cnv',
        fill = 0
    )

CNV_types = segs_filtered %>% 
    mutate(ch_cnv = 1) %>%
    mutate(type = paste0('ch_', type)) %>%
    group_by(dmp_patient_id, type) %>%
    summarise(ch_cnv = max(ch_cnv)) %>%
    ungroup() %>%
    reshape2::dcast(
        dmp_patient_id ~ type,
        value.var = 'ch_cnv',
        fill = 0
    )

segs_filtered_auto = segs_filtered %>% filter(chrom != 23)

window = 5e6

max0 = function(x) {
  if (length(x) == 0) {
    return(0)
  } else {
    return(max(x))
  }
}

MAXVAF_trans = M_long %>%
    filter(ch_nonsilent == 1) %>%
    rowwise() %>%
    mutate(
        locality = ifelse(
            any(chrom == segs_filtered_auto$chrom & dmp_patient_id == segs_filtered_auto$dmp_patient_id &
                start >= segs_filtered_auto$start - window & start <= segs_filtered_auto$end + window
            ),
            'cis',
            'trans'
        )
    ) %>%
    ungroup() %>%
    filter(locality == 'trans') %>%
    group_by(dmp_patient_id) %>%
    summarise(
        VAF_trans = max(VAF_N),
        mutnum_trans = sum(ch_nonsilent),
        mutnum_pd_trans = sum(ch_nonsilent & ch_pd),
        VAF_pd_trans = max0(VAF_N[ch_pd == 1])
    )

chroms = colnames(CNV)[-1]
chroms = chroms[chroms != 'ch_cnv']
genes = c('DNMT3A', 'TET2', 'ASXL1', 'JAK2', 'TP53', 'PPM1D', 'ATM', 'EZH2')

M_wide = M_wide %>%
    left_join(CNV, by = "dmp_patient_id") %>%
    left_join(CNV_detailed, by = "dmp_patient_id") %>%
    left_join(CNV_types, by = "dmp_patient_id") %>%
    left_join(CNV_detailed_arm, by = "dmp_patient_id") %>%
    left_join(MAXVAF_trans, by = 'dmp_patient_id') %>%
    left_join(
        segs_filtered_auto %>%
            group_by(dmp_patient_id) %>%
            summarise(
                phi_cnv = max(phi),
                phi_cnv_auto = max(phi[chrom != 23]),
                n_cnv = n(),
                n_cnv_chrom = length(unique(chrom)),
                ch_cnv_auto = 1,
                .groups = 'drop'),
        by = 'dmp_patient_id'
    ) %>%
    mutate_at(colnames(CNV), function(x){ifelse(is.na(x), 0, x)}) %>%
    mutate_at(colnames(CNV_detailed), function(x){ifelse(is.na(x), 0, x)}) %>%
    mutate_at(colnames(CNV_detailed_arm), function(x){ifelse(is.na(x), 0, x)}) %>%
    mutate_at(colnames(CNV_types), function(x){ifelse(is.na(x), 0, x)}) %>%
    mutate_at(
        c('phi_cnv', 'VAF_trans', 'VAF_pd_trans', 'n_cnv', 'n_cnv_chrom', 'ch_cnv_auto', 'mutnum_trans', 'mutnum_pd_trans', 'phi_cnv_auto', 'phi_cnv'),
        function(x){ifelse(is.na(x), 0, x)}
    )

M_long = M_long %>%
    left_join(CNV, by = "dmp_patient_id") %>%
    left_join(CNV_detailed, by = "dmp_patient_id") %>%
    left_join(CNV_types, by = "dmp_patient_id") %>%
    left_join(CNV_detailed_arm, by = "dmp_patient_id") %>%
    mutate_at(colnames(CNV), function(x){ifelse(is.na(x), 0, x)}) %>%
    mutate_at(colnames(CNV_detailed), function(x){ifelse(is.na(x), 0, x)}) %>%
    mutate_at(colnames(CNV_types), function(x){ifelse(is.na(x), 0, x)}) %>%
    mutate(ch_cnv_auto = as.integer(ch_cnv & (!chr23)))

M_wide = M_wide %>%
    mutate(age_bin = cut(age, c(seq(20, 80, 10), Inf))) %>%
    mutate(
        age10 = age/10,
        age10_sq = age10^2
    )

M_wide = M_wide %>%
    mutate(
        Race = case_when(
            Race == 'ASIAN-FAR EAST/INDIAN SUBCONT' ~ 'asian',
            Race == 'BLACK OR AFRICAN AMERICAN' ~ 'black',
            Race == 'WHITE' ~ 'white',
            T ~ 'other'
        )
    )

# loss of Y
segs_y = fread('./data/y_segs.tsv', sep = '\t')
LOY = segs_y %>% filter(p_x < 1e-17 & cnlr < 0)
M_wide = M_wide %>%
    mutate(loy = ifelse(Gender == 'Male', dmp_sample_id %in% LOY$dmp_sample_id, NA)) %>% 
    left_join(LOY %>% mutate(loy_logr = cnlr)) %>%
    mutate(loy_dose = ifelse(loy == 1, -exp(loy_logr), 0)) %>%
    mutate(loy_bin = as.character(ifelse(loy == 1, cut(loy_dose, 3), 0)))

# cbcs
cbcs = c("anc", "alc", "amc", "hgb", "mcv", "rdw", "plt", "wbc", "rbc")
M_wide = M_wide %>% mutate(rdw = rdw * mcv)

M_wide = M_wide %>%
    mutate_at(cbcs, .funs = list(n = function(x){(x - mean(na.omit(x))) / sd(na.omit(x))}))

nrow(M_wide) %>% paste(' patients valid for analysis!')

cnv_pal = c('loh' = 'darkgreen', 'amp' = 'darkblue', 'del' = 'darkred', 'err' = 'gray')
cnv_pal2 = c('CNLOH' = 'darkgreen', 'AMP' = 'darkblue', 'DEL' = 'darkred')
sm_pal = c('Regulatory' = 'darkturquoise', 'Missense' = 'salmon', 'Truncating' = 'purple')

gene_structures = fread('./data/gene_structures.tsv', sep = '\t') %>% filter(class == 'protein_coding')

sm_col = '#FDCDAC'
cnv_col = '#B3E2CD'

cbc_labels = c('anc' = 'Neutrophil', 'alc' = 'Lymphocyte', 'amc' = 'Monocyte',
  'hgb' = 'Hemoglobin', 'mcv' = 'Mean corpuscular volume', 
  'rdw' = 'Red cell distribution width', 'plt' = 'Platelets',
  'wbc' = 'White blood cell', 'rbc' = 'Red blood cell')

do_plot = function(p, f, w, h) {
    ggsave(filename = paste0('/work/isabl/home/sirenkom/teng_mCA/figures/', f), plot = p, width = w, height = h, device = 'pdf')
    options(repr.plot.width = w, repr.plot.height = h, repr.plot.res = 300)
    print(p)
}
`summarise()` ungrouping output (override with `.groups` argument)

'37tMN, 12CLL, 7MPN'
A tibble: 2 × 2
therapy_detailedn
<lgl><int>
FALSE22067
TRUE10375
Joining, by = "dmp_sample_id"

`summarise()` regrouping output by 'dmp_patient_id' (override with `.groups` argument)

1
383
`summarise()` regrouping output by 'dmp_patient_id' (override with `.groups` argument)

`summarise()` regrouping output by 'dmp_patient_id', 'chrom' (override with `.groups` argument)

`summarise()` regrouping output by 'dmp_patient_id', 'chrom', 'type' (override with `.groups` argument)

`summarise()` regrouping output by 'dmp_patient_id' (override with `.groups` argument)

`summarise()` ungrouping output (override with `.groups` argument)

Joining, by = "dmp_sample_id"

'32442 patients valid for analysis!'
In [766]:
# for getting gene structures
# gtf = fread('/work/isabl/ref/homo_sapiens/GRCh37d5/ensembl/75/Homo_sapiens.GRCh37.75.gtf', skip = 'gene_id')
# colnames(gtf) = c('chrom', 'class', 'structure_type', 'start', 'end', 'x', 'y', 'z', 'description')
# gene_structures = gtf %>% filter(str_detect(description, paste(paste0('"', gene_bed$Gene, '"'), collapse = '|')) & structure_type == 'exon')
In [9]:
# # export for sebatia
# M_wide_export = M_wide %>% 
#     select(one_of(c('dmp_patient_id', 'dmp_sample_id', 'age', 'Gender', 'ch_nonsilent', colnames(CNV), colnames(CNV_detailed), colnames(CNV_types))))

# M_wide_export %>% fwrite('./data/ch_cnvs_sebastia.tsv')
In [269]:
# M_wide %>% filter(ch_cnv == 1) %>% select(dmp_patient_id, nonparta) %>% fwrite('./data/clinicalreview/cnv_patient_review_jul12.tsv', sep = '\t')
In [6]:
M_long
A data.table: 20405 × 299
chromstartRefAltvariant_idVAF_NVariantClassGeneExoncDNAchangechr20_q_delchr20_q_lohchr21_q_delchr21_q_lohchr22_q_ampchr22_q_delchr22_q_lohchr23_p_delchr23_pq_delch_cnv_auto
<int><int><chr><chr><chr><dbl><chr><chr><int><chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><int>
7116397849G A 7_116397849_G_A 0.08060Intron MET NAc.2102+21G>A NANANANANANANANANA0
11108196873T C 11_108196873_T_C0.03432Missense_MutationATM 47c.6896T>C NANANANANANANANANA0
16 396819C T 16_396819_C_T 0.02218Silent AXIN1 2c.207G>A NANANANANANANANANA0
2 25457158G A 2_25457158_G_A 0.04795Missense_MutationDNMT3A23c.2729C>T NANANANANANANANANA0
2 25463308G C 2_25463308_G_C 0.06355Missense_MutationDNMT3A19c.2185C>G NANANANANANANANANA0
2 25467158G A 2_25467158_G_A 0.09457Nonsense_MutationDNMT3A15c.1717C>T NANANANANANANANANA0
8 74885121C T 8_74885121_C_T 0.032845'Flank TCEB1 NA NANANANANANANANANA0
15 90631934C T 15_90631934_C_T 0.37615Missense_MutationIDH2 4c.419G>A NANANANANANANANANA0
17 74732959G A 17_74732959_G_A 0.38683Missense_MutationSRSF2 1c.284C>T NANANANANANANANANA0
12121434787C T 12_121434787_C_T0.02245Intron HNF1A NAc.1309+242C>T NANANANANANANANANA0
11108199761C G 11_108199761_C_G0.04701Missense_MutationATM 49c.7103C>G NANANANANANANANANA0
2 25457276GGAG 2_25457276_GGA_G0.02287Frame_Shift_Del DNMT3A23c.2609_2610delTCNANANANANANANANANA0
7 55225447G GT7_55225447_G_GT 0.02841Splice_Site EGFR 11c.1298+2dupT 0 0 0 0 0 0 0 0 10
9 87482373T C 9_87482373_T_C 0.06458Intron NTRK2 NAc.1633+27T>C 0 0 0 0 0 0 0 0 10
2 25464564A G 2_25464564_A_G 0.03987Missense_MutationDNMT3A17c.1949T>C NANANANANANANANANA0
7148516722T C 7_148516722_T_C 0.45528Missense_MutationEZH2 9c.965A>G NANANANANANANANANA0
1120483182C T 1_120483182_C_T 0.02222Missense_MutationNOTCH219c.3179G>A NANANANANANANANANA0
17 58740529C A 17_58740529_C_A 0.06551Nonsense_MutationPPM1D 6c.1434C>A NANANANANANANANANA0
17 40456448G A 17_40456448_G_A 0.02063Splice_Site STAT5A11c.1257+1G>A NANANANANANANANANA0
19 15281398C T 19_15281398_C_T 0.07776Intron NOTCH3NAc.4892-34G>A NANANANANANANANANA0
20 31021191G GT20_31021191_G_GT0.05872Frame_Shift_Ins ASXL1 11c.1191dupT NANANANANANANANANA0
4106193787G T 4_106193787_G_T 0.04784Missense_MutationTET2 10c.4249G>T NANANANANANANANANA0
12 12026408C A 12_12026408_C_A 0.12000Intron ETV6 NAc.1009+3505C>A NANANANANANANANANA0
5176523050C G 5_176523050_C_G 0.06135Intron FGFR4 14c.1822-8C>G NANANANANANANANANA0
15 42003465G T 15_42003465_G_T 0.21850Missense_MutationMGA 8c.3002G>T NANANANANANANANANA0
10112771558A G 10_112771558_A_G0.05214Silent SHOC2 9c.1731A>G NANANANANANANANANA0
12 49446161G A 12_49446161_G_A 0.02616Silent KMT2D 10c.1305C>T NANANANANANANANANA0
19 41748914G A 19_41748914_G_A 0.03428Missense_MutationAXL 11c.1439G>A NANANANANANANANANA0
2 25463583G A 2_25463583_G_A 0.02270Missense_MutationDNMT3A18c.2099C>T NANANANANANANANANA0
20 9547013G C 20_9547013_G_C 0.03581Missense_MutationPAK7 5c.1009C>G NANANANANANANANANA0
2 25469611A AC2_25469611_A_AC 0.24066Frame_Shift_Ins DNMT3A10c.1156dupG NANANANANANANANANA0
2198266757A G 2_198266757_A_G 0.03874Silent SF3B1 15c.2175T>C NANANANANANANANANA0
4106196282C T 4_106196282_C_T 0.07205Nonsense_MutationTET2 11c.4615C>T NANANANANANANANANA0
2 25467428C T 2_25467428_C_T 0.13982Missense_MutationDNMT3A14c.1648G>A NANANANANANANANANA0
2 25457242C T 2_25457242_C_T 0.10714Missense_MutationDNMT3A23c.2645G>A NANANANANANANANANA0
17 78617547T G 17_78617547_T_G 0.05208Silent RPTOR 3c.285T>G NANANANANANANANANA0
2 25464486C T 2_25464486_C_T 0.05764Missense_MutationDNMT3A17c.2027G>A NANANANANANANANANA0
23 70348481A G X_70348481_A_G 0.04247Missense_MutationMED12 24c.3388A>G NANANANANANANANANA0
11108216609C G 11_108216609_C_G0.03171Missense_MutationATM 58c.8558C>G NANANANANANANANANA0
2 25468892CA C 2_25468892_CA_C 0.25108Frame_Shift_Del DNMT3A12c.1470delT NANANANANANANANANA0
21 36206660G C 21_36206660_G_C 0.08287Intron RUNX1 NAc.805+47C>G NANANANANANANANANA0
23 66788864G T X_66788864_G_T 0.05575Intron AR NAc.1616+22260G>T NANANANANANANANANA0
23 20156717T C X_20156717_T_C 0.02132Missense_MutationEIF1AX 2c.40A>G NANANANANANANANANA0
2212288986G C 2_212288986_G_C 0.02793Silent ERBB4 23c.2760C>G NANANANANANANANANA0
2 25463296C CA2_25463296_C_CA 0.03887Frame_Shift_Ins DNMT3A19c.2196dupT NANANANANANANANANA0
2 25470910TA T 2_25470910_TA_T 0.03125Frame_Shift_Del DNMT3A 7c.850delT NANANANANANANANANA0
7148506428G A 0.48168Missense_MutationEZH2 18c.2084C>T 0 0 0 0 0 0 0 0 01
7148511124CAAGTC 0.76562Frame_Shift_Del EZH2 15c.1774_1777delACTT 0 0 0 0 0 0 0 0 01
17 7578551A G 0.18354Missense_MutationTP53 5c.379T>C 0 0 0 0 0 0 0 0 01
4106180785C A 0.77922Nonsense_MutationTET2 7c.3813C>A 0 0 0 0 0 0 0 0 01
4106156651A AT 0.64460Frame_Shift_Ins TET2 3c.1557dupT 0 0 0 0 0 0 0 0 01
4106196323TCAGAT 0.38462Frame_Shift_Del TET2 11c.4661_4664delCAGA 0 0 0 0 0 0 0 0 01
4106196690AT A 0.48288Frame_Shift_Del TET2 11c.5024delT 0 0 0 0 0 0 0 0 01
4106196580C A 0.76147Nonsense_MutationTET2 11c.4913C>A 0 0 0 0 0 0 0 0 01
4106162507G T 0.70417Nonsense_MutationTET2 4c.3421G>T 0 0 0 0 0 0 0 0 01
9 5073770G T 0.61688Missense_MutationJAK2 14c.1849G>T 0 0 0 0 0 0 0 0 01
4106157152CA C 0.59296Frame_Shift_Del TET2 3c.2056delA 0 0 0 0 0 0 0 0 01
7148506168C CA 0.41000Frame_Shift_Ins EZH2 19c.2187dupT 0 0 0 0 0 0 0 0 01
7148526937C A 0.64000Nonsense_MutationEZH2 5c.367G>T 0 0 0 0 0 0 0 0 01
1 43818306T G 0.48000Missense_MutationMPL 12c.1771T>G 0 0 0 0 0 0 0 0 01
In [9]:
# # sample list for Elli
# M_long %>% group_by(dmp_patient_id) %>%
# filter(any(Gene == 'TP53')) %>%
# left_join(
#     M_wide %>% select(dmp_sample_id, post_heme)
# ) %>%
# select(Gene, chrom, start, Ref, Alt, AAchange, dmp_patient_id, dmp_sample_id, ch_cnv, post_heme, heme_cat) %>%
# fwrite('./data/clinicalreview/tp53_all_variants.tsv', sep = '\t')
Joining, by = "dmp_sample_id"

Cohort characteristics

In [251]:
source('./ch-cnv/notebooks/table_one.R')

M_wide = M_wide %>% mutate(months_followup = as.integer(date_lastfu - DateofProcedure)/(365/12))

options(warn = -1)
table1 = tab_response(M_wide, xs = c('total', 'age', 'Gender', 'Race', 'smoke', 'months_followup', cbcs), y = 'ch_cnv')
options(warn = 0)
Loading required package: reshape2


Attaching package: ‘reshape2’


The following objects are masked from ‘package:data.table’:

    dcast, melt


In [253]:
kb = table1 %>% 
    mutate(overall = ifelse(variable %in% c('Gender', 'Race', 'smoke'), ' ', overall)) %>%
    rename(Overall = overall) %>%
    mutate(variable = ifelse(variable == 'smoke', 'Smoking history', variable)) %>%
    mutate(variable = ifelse(variable %in% names(cbc_labels), cbc_labels[variable], variable)) %>%
    mutate(variable = str_replace(variable, '-', ' ')) %>%
    mutate(levels = ifelse(levels == 'total', ' ', levels)) %>%
    mutate(variable = ifelse(variable == 'total', 'total subjects', variable)) %>%
    mutate(levels = Hmisc::capitalize(levels)) %>%
    mutate(variable = Hmisc::capitalize(variable)) %>%
    rename(
        'mCA-' = `0`, 'mCA+' = `1`,
        Characteristics = variable,
        ' ' = levels
    ) %>%
    kable(format = "html", booktabs = T) %>%
    collapse_rows(columns = 1:2, row_group_label_position = 'stack') %>%
    row_spec(0:nrow(table1), background = 'white', align = 'center', extra_css = "border-bottom: 1px solid") %>%
    row_spec(0, background = 'gray', extra_css = "border-bottom: 2px solid") %>%
    kable_styling(position = "center", full_width = F)

kb %>%
toString() %>%
display_html()

save_kable(kb, './figures/cohort.pdf')
Characteristics mCA- mCA+ Overall
Total subjects 32096 346 32442
Age Mean (SD) 60.5 (13.7) 69.5 (12.1) 60.6 (13.7)
Gender Female 17821 181
Male 14275 165
Race Asian 2286 13
Black 2088 15
Other 2738 18
White 24984 300
Smoking history No 14972 131
Yes 15560 207
Missing 1564 8
Months followup Mean (SD) 19.7 (16.4) 18.9 (14.9) 19.7 (16.3)
Neutrophil 5.2 (3.4) 5.3 (3.1) 5.2 (3.4)
Lymphocyte 1.5 (0.7) 1.6 (1.4) 1.5 (0.7)
Monocyte 0.6 (0.3) 0.7 (0.4) 0.6 (0.3)
Hemoglobin 12.5 (1.9) 12.4 (1.8) 12.5 (1.9)
Mean corpuscular volume 89.8 (6.5) 91.5 (6.6) 89.9 (6.5)
Red cell distribution width 1281.2 (197.5) 1326.6 (200.1) 1281.7 (197.6)
Platelets 257 (101.2) 240.2 (104.8) 256.9 (101.3)
White blood cell 7.4 (3.7) 7.8 (3.5) 7.4 (3.7)
Red blood cell 4.2 (0.6) 4.2 (0.6) 4.2 (0.6)
Note that HTML color may not be displayed on PDF properly.

save_kable will have the best result with magick installed. 

collect all segs

In [ ]:
# segs = P$dmp_sample_id %>%
#     lapply(function(sid){
#         outdir = '/work/isabl/home/gaot/ch_cnv/results'
#         segfile = glue('{outdir}/{sid}_seg.tsv')
#         if (file.exists(segfile)) {
#             seg = fread(segfile)
#             noisy = nrow(seg) > 35
#             if (noisy) {
#                 data.frame()
#             } else {
#                 seg %>% mutate(sample = sid) %>% filter(aberrant)
#             }
#         } else {
#             data.frame()
#         }
#     }) %>% Reduce(rbind, .)

# segs = segs %>% left_join(P, by = c('sample' = 'dmp_sample_id'))
In [811]:
# outdir = '/work/isabl/home/gaot/ch_cnv/results_jul2'

# segs = M_wide$dmp_sample_id %>%
#     lapply(function(sid){
#         segfile = glue('{outdir}/{sid}_seg.tsv')
#         if (file.exists(segfile)) {
#             seg = fread(segfile)
#             seg %>% mutate(dmp_sample_id = sid) %>% filter(aberrant)
#         } else {
#             data.frame(dmp_sample_id = c(sid)) %>% mutate(dmp_sample_id = as.character(dmp_sample_id))
#         }
#     }) %>% Reduce(bind_rows, .)

# print('done!')

# segs %>% fwrite('./data/segs_aberrant_jul2.tsv', sep = '\t')
In [675]:
# outdir = '/work/isabl/public/facets-ch/dmp_processed'

# segs = M_wide %>%
#     filter(nonparta == 1) %>%
#     pull(dmp_sample_id) %>%
#     lapply(function(sid){
#         segfile = glue('{outdir}/{sid}_seg.tsv')
#         if (file.exists(segfile)) {
#             seg = fread(segfile)
#             seg %>% mutate(dmp_sample_id = sid) %>% filter(aberrant)
#         } else {
#             data.frame(dmp_sample_id = c(sid)) %>% mutate(dmp_sample_id = as.character(dmp_sample_id))
#         }
#     }) %>% Reduce(bind_rows, .)

# print('done!')

# # correct for phi and variant type
# segs[c('type', 'phi', 'err')] = t(mapply(variant_type, segs$cnlr_adj, segs$valor, segs$aberrant))

# segs %>% fwrite('./data/segs_aberrant_nonparta.tsv', sep = '\t')
[1] "done!"

General CNV characteristics

In [6]:
segs_filtered %>% nrow %>% paste('unique events')
segs_filtered %>% count(auto = chrom != 23)
segs_filtered %>% pull(dmp_patient_id) %>% unique %>% length %>% paste('unique individuals with CNV')

segs_filtered %>% 
filter(chrom != 23) %>%
count(type) %>% mutate(prop = signif(n/sum(n), 2))

segs_filtered %>%
filter(chrom != 23) %>%
count(dmp_patient_id) %>% count(n) %>% mutate(prop = signif(nn/sum(nn), 2))

cat('Cell fraction ranges:', segs_filtered$phi %>% range %>% signif(2) %>% paste(collapse = '-') %>% paste('\n'))
cat('median cell fraction:', segs_filtered$phi %>% median %>% signif(2) %>% paste('\n'))
cat('median cell fraction:', segs_filtered$phi %>% median %>% signif(2) %>% paste('\n'))
'383 unique events'
A data.table: 2 × 2
auton
<lgl><int>
FALSE 48
TRUE335
'346 unique individuals with CNV'
A data.table: 3 × 3
typenprop
<chr><int><dbl>
amp 560.17
del1560.47
loh1230.37
Storing counts in `nn`, as `n` already present in input
 Use `name = "new_name"` to pick a new name.

A data.table: 4 × 3
nnnprop
<int><int><dbl>
12750.9100
2 210.0700
3 20.0066
4 30.0100
Cell fraction ranges: 0.1-0.9 
meidan cell fraction: 0.32 

cancer types

In [1072]:
signif.num <- function(x, ns = FALSE, pch = TRUE) {
    if (ns) {
        symbols = c("***", "**", "*", ".", "ns")
    } else {
        symbols = c("***", "**", "*", ".", "")
    } 
    
    if (pch) {
        symbols = c(8, 8, 3, 20)
    }
    
    symnum(unlist(x), corr = FALSE, na = FALSE, legend = FALSE,
           cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1), 
           symbols = symbols)
}
In [75]:
fit = glm(data = M_wide, formula = ch_cnv_auto ~ age + Gender + race_b, family = 'binomial')

D = M_wide %>%
    group_by(tumor_type) %>%
    summarise(
        cnv_freq = sum(ch_cnv_auto)/n(),
        cnv_count = sum(ch_cnv_auto),
        sm_freq = sum(ch_nonsilent)/n(),
        n_male = sum(Gender == 'Male'),
        loy_freq = sum(loy[Gender == 'Male'])/n_male,
        n = n(),
        age_median = median(age),
        cnv_freq_pred = sum(predict(fit, data.frame(age = age, Gender = Gender, race_b = race_b), type="response"))/n()
    ) %>%
    mutate(
        cnv_freq_lower = qbinom(p = 0.05, size = n, prob = cnv_freq)/n,
        cnv_freq_upper = qbinom(p = 0.95, size = n, prob = cnv_freq)/n,
        sm_freq_lower = qbinom(p = 0.05, size = n, prob = sm_freq)/n,
        sm_freq_upper = qbinom(p = 0.95, size = n, prob = sm_freq)/n,
        loy_freq_lower = qbinom(p = 0.05, size = n, prob = loy_freq)/n_male,
        loy_freq_upper = qbinom(p = 0.95, size = n, prob = loy_freq)/n_male
    ) %>%
    filter(n > 300) %>%
    arrange(cnv_freq_pred) %>%
#   mutate(tumor_type = paste0(tumor_type, '(', n, ')')) %>%
    mutate(tumor_type = factor(tumor_type, unique(tumor_type)))
    
D = D %>% 
    rowwise() %>%
    mutate(p = prop.test(x = cnv_count, n = n, p = cnv_freq_pred)$p.value) %>%
    mutate(
        q = p.adjust(p, method = 'BH'),
        q.stars = gtools::stars.pval(q)
    ) %>%
    ungroup()

coef = mean(D$sm_freq)/mean(D$cnv_freq)

p1 = D %>% 
    ggplot(
        aes()
    ) +
    theme_classic() +
    geom_point(
        inherit.aes = F,
        aes(x = tumor_type,
            y = cnv_freq_pred
        ),
        color = 'royalblue',
        pch = 21,
        size = 3
    ) +
    geom_pointrange(
        inherit.aes = F,
        aes(x = tumor_type,
            y = cnv_freq, 
            ymin = cnv_freq_lower, 
            ymax = cnv_freq_upper,
#             color = 'CNV-CH'
        ),
        stat = 'identity',
        pch = 5
    ) +
#     geom_text(
#         aes(label = q.stars, x = tumor_type, y = 0.03),
# #         hjust = 1.5,
# #         vjust = -0.2,
#         size = 5
#     ) +
    theme(
        axis.text.x = element_blank(),
        plot.margin = unit(c(0,0,0,0), 'mm')
#         axis.title.x = element_blank()
    ) +
    xlab('') +
    ylab('Proportion with mCA')

give.n <- function(x){
  return(c(y = 100, label = length(x))) 
}

p2 = M_wide %>% filter(tumor_type %in% D$tumor_type) %>%
    mutate(tumor_type = factor(tumor_type, levels(D$tumor_type))) %>%
    ggplot(aes(x = tumor_type, y = age)) +
    theme_classic() +
    geom_violin(draw_quantiles = c(0.5)) +
    theme(
        axis.text.x = element_text(angle = 30, hjust = 1, size = 7),
        plot.margin = unit(c(0,0,0,0), 'mm')
    ) +
    xlab('') +
    stat_summary(fun.data = give.n, geom = "text", vjust = 0, size = 2) +
    scale_y_continuous(expand = expansion(add = 10)) +
    ylab('Age')

panel = p1/p2 + plot_layout(heights = c(2,1))

do_plot(panel, 'cancer_types.pdf', w = 5, h = 4)
`summarise()` ungrouping output (override with `.groups` argument)

Warning message in prop.test(x = cnv_count, n = n, p = cnv_freq_pred):
“Chi-squared approximation may be incorrect”
Warning message in prop.test(x = cnv_count, n = n, p = cnv_freq_pred):
“Chi-squared approximation may be incorrect”
Warning message in prop.test(x = cnv_count, n = n, p = cnv_freq_pred):
“Chi-squared approximation may be incorrect”

Co-mutation

In [812]:
table(M_wide$DNMT3A, M_wide$ch_cnv_auto)
   
        0     1
  0 28483   248
  1  3658    55
In [27]:
table(M_wide$DNMT3A, M_wide$ch_cnv_auto) %>% fisher.test
	Fisher's Exact Test for Count Data

data:  .
p-value = 0.0005483
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 1.257536 2.316470
sample estimates:
odds ratio 
  1.719808 
In [76]:
options(repr.plot.width = 4, repr.plot.height = 2, repr.plot.res = 400)

sm_classes = M_long %>% pull(VariantClass2) %>% unique

colors = c(cnv_pal2, sm_pal) 

heme_colors = c(' ' = 'white', 'CLL' = 'royalblue', 'MDS' = 'tomato3', 'MPN' = 'purple', 'Lymphoma' = 'yellowgreen')

colors = c(colors, heme_colors)

gene_set = c(fread('./data/gene_set.tsv')$Gene, 'PPM1D', 'SRSF2')

# basically the same
gene_set = M_long %>% 
    filter(ch_nonsilent == 1) %>%
    filter(Gene %in% gene_set) %>%
    filter(dmp_patient_id %in% segs_filtered$dmp_patient_id) %>%
    count(Gene) %>% filter(n >= 2) %>% pull(Gene)

# gene_set = c('DNMT3A', 'TET2', 'JAK2', 'ASXL1', 'TP53', 'ATM', 'PPM1D', 'CHEK2',
#              'SRSF2', 'SF3B1', 'U2AF1', 'EZH2', 'MPL', 'NOTCH1', 'TERT', 'SUZ12', 'CBL')

chroms = segs_filtered %>% count(chrom) %>%
#     filter(n >= 4) %>% 
    pull(chrom)
# max_rows = 15
max_rows = 22

plots = list()

for (this_chrom in chroms) {
    
    chrom_label = ifelse(this_chrom == 23, 'X', this_chrom)

    segs_auto = segs_filtered %>% filter(chrom == this_chrom)

    M_co = rbind(
            M_long %>% 
            filter(Gene %in% gene_set) %>%
            filter(ch_nonsilent == 1) %>%
            filter(dmp_sample_id %in% segs_auto$dmp_sample_id) %>% 
            mutate(
                mutation = Gene,
                type = VariantClass2,
                arm = 'na',
                focality = 'na'
            ) %>%
            select(mutation, type, arm, focality, dmp_patient_id),
            segs_auto %>% 
            mutate(
                mutation = paste0(arm, type, focality)
            ) %>%
            select(mutation, type = type2, arm, focality, dmp_patient_id)
        ) %>%
        group_by(mutation) %>%
        mutate(mutation_n = n()) %>%
        ungroup()

    D = M_co %>%
        distinct(mutation, dmp_patient_id, `.keep_all` = T) %>%
        group_by(mutation) %>% mutate(mutation_n = n()) %>% ungroup() %>%
        arrange(type %in% sm_classes, desc(mutation_n)) %>%
        mutate(mutation = factor(mutation, rev(unique(mutation)))) %>%
        rowwise() %>%
        mutate(mut_weight = 2^which(mutation == levels(mutation))) %>%
        ungroup() %>%
        group_by(dmp_patient_id) %>% 
        mutate(patient_weight = sum(mut_weight)) %>%
        ungroup() %>%
        arrange(desc(patient_weight), type) %>%
        mutate(pid = as.integer(factor(dmp_patient_id, unique(dmp_patient_id))))
    
    D = D %>% arrange(desc(mutation)) %>%
        mutate(mutation = as.character(mutation)) %>%
        mutate(mutation = str_remove_all(mutation, 'amp|del|loh|broad|focal')) %>%
        mutate(mutation = factor(mutation, rev(unique(mutation))))
    
    n_rows = length(levels(D$mutation))
    n_cols = length(unique(D$pid))
    
    D = D %>% mutate(mutation = factor(mutation, c(as.character(1:(max_rows - n_rows)), levels(mutation))))
    nrows_sm = D %>% filter(type %in% sm_classes) %>% pull(mutation) %>% unique %>% length

    p = ggplot(
        D,
        aes(x = pid, y = mutation, fill = type)
    ) +
    geom_rect(
        xmin = 0, xmax = n_cols + 0.5, ymin = max_rows - n_rows + nrows_sm + 0.5, ymax = max_rows + 0.5,
        fill = 'whitesmoke',
    ) +
    geom_segment(
        aes(x = pid + 0.5, xend = pid + 0.5),
        y = max_rows - n_rows + 0.5,
        yend = max_rows + 0.5,
        color = 'gray90',
        size = 0.3
    ) +
    geom_hline(yintercept = max_rows - n_rows + nrows_sm + 0.5, size = 0.3, color = 'gray90') +
#     geom_hline(yintercept = max_rows + 0.5, size = 0.5, color = 'black') +
#     geom_hline(yintercept = max_rows - n_rows + 0.5, size = 0.5, color = 'black') +
    geom_segment(x = 0.5, xend = 0.5, y = max_rows - n_rows + 0.5, yend = max_rows + 0.5) +
    geom_rect(
        aes(
            xmin = pid - 0.4,
            xmax = pid + 0.4, 
            ymin = case_when(
                focality == 'focal' ~ as.numeric(mutation) - 0.1,
                arm == 'p' ~ as.numeric(mutation),
                type %in% sm_classes ~ as.numeric(mutation) - 0.45,
                T ~ as.numeric(mutation) - 0.45
            ),
            ymax = case_when(
                focality == 'focal' ~ as.numeric(mutation) + 0.1,
                arm == 'p' ~ as.numeric(mutation) + 0.45,
                arm == 'q' ~ as.numeric(mutation),
                arm == 'p,q' ~ as.numeric(mutation) + 0.45,
                type %in% sm_classes ~ as.numeric(mutation) + 0.45
            )
        ),
        size = 0, colour = "white"
    ) +
    # centromeres
    geom_point(
        data = D %>% filter((!type %in% sm_classes) & focality != 'focal'),
        aes(color = type),
        size = 1,
        show.legend = FALSE
    ) +
    theme_classic() +
    theme(
        axis.text.x = element_blank(),
        plot.margin = margin(t = 0, r = 1, b = 0, l = 0, unit = "mm"),
#         panel.spacing = unit(1,'cm'),
        axis.line = element_blank(),
        axis.ticks = element_blank(),
        legend.box = "horizontal",
        plot.title = element_text(size = 10)
    ) +
    guides(
        fill = guide_legend(ncol = 2, title = 'Variant Types'),
        color = FALSE
    ) +
    scale_fill_manual(
        values = colors
    ) +
    scale_color_manual(
        values = colors
    ) +
    scale_x_discrete(
        expand = expansion(add = 0.2)
    ) +
    scale_y_discrete(
        expand = expansion(add = 0.2),
        labels = str_remove_all(levels(D$mutation), 'broad|focal|del|del|loh|amp|^\\d+$'),
        drop = F
    ) +
    ggtitle(chrom_label) +
    ylab('') +
    xlab('')
    
    if (this_chrom != 14) {
        p = p + theme(legend.position = 'none')
    } else {
        p = p + theme(legend.position = 'right')
    }
    
    # patient heme status
#     D_heme = D %>% distinct(dmp_patient_id, pid) %>% 
#         left_join(
#             M_wide %>% select(dmp_patient_id, heme_cat),
#             by = 'dmp_patient_id'
#         ) %>% 
#         mutate(
#             heme_cat = ifelse(heme_cat %in% c('DLBCL', 'FL', 'MZL'), 'Lymphoma', heme_cat)
#         ) %>% filter(heme_cat != '')
        
#     if (nrow(D_heme) != 0) {
#         p = p + geom_point(
#             inherit.aes = F,
#             data = D_heme,
#             mapping = aes(x = pid, fill = heme_cat),
#             y = max_rows - n_rows,
#             pch = 24, size = 2,
#             colour = "white"
#         )
        
#     }

    plots[[this_chrom]] = p
}
In [77]:
plots = plots[lengths(plots) != 0]

plot_widths = segs_filtered %>% 
    distinct(chrom, dmp_patient_id) %>%
    filter(chrom %in% chroms) %>% count(chrom) %>% pull(n)

panel = wrap_plots(plotlist = plots, widths = plot_widths, nrow = 1, guides = 'collect')

do_plot(panel, 'oncoplot.pdf', w = 40, h = 3.5)
In [248]:
18002/32442
0.554897971764996
In [249]:
M_wide %>% count(smoke)
A tibble: 3 × 2
smoken
<fct><int>
no 15103
yes 15767
missing 1572

trans effect

In [80]:
# genes = c('DNMT3A', 'TET2', 'ASXL1', 'EZH2', 'TP53', 'ATM', 'CHEK2', 'PPM1D',
#         'SRSF2', 'SF3B1', 'U2AF1', 'MPL', 'JAK2', 'CBL', 'RUNX1', 'NOTCH1', 'MYD88', 'TERT', 'SUZ12', 'BRCA2', 'KIT', 'IKZF1')

genes = c('DNMT3A', 'TET2', 'ASXL1', 'EZH2', 'TP53', 'ATM', 'CHEK2', 'PPM1D',
        'SRSF2', 'SF3B1', 'U2AF1', 'MPL', 'JAK2', 'TERT')

res = data.frame()

# don't test for all if chrom has predominately one kind of change
single_type_chroms = segs_filtered %>% 
    count(chrom, type) %>% 
    group_by(chrom) %>%
    mutate(total = sum(n)) %>%
    mutate(prop = n/total) %>% 
    filter(prop > 0.7) %>% pull(chrom) %>%
    paste0('chr', .)

chroms_detailed = colnames(CNV_detailed)[-1][(CNV_detailed[,-1] %>% colSums) > 3]
chroms_broad = colnames(CNV)[-1][(CNV[,-1] %>% colSums) > 3]
chroms_broad = chroms_broad[chroms_broad != 'ch_cnv']
chroms_broad = chroms_broad[!(chroms_broad %in% single_type_chroms)]
chroms = c(chroms_broad, chroms_detailed)
# chroms = chroms[str_extract(chroms, '\\d+') != 23]
# chroms = c(chroms, 'loy')

ntest = 0

for (gene in genes) {
    
    display(gene)
    gene_chrom = gene_bed %>% filter(Gene == gene) %>% pull(chrom)
    
    for (chrom in chroms) {
        
#         chroms[str_extract(chroms, '\\d+') != gene_chrom]
        
        ntest = ntest + 1
        
#         D = M_wide %>% filter(!cytopenia)
        D = M_wide
        
        counts = table(D[[gene]], D[[chrom]])
        
        if (ncol(counts) != 1) {
            
            if (chrom == 'chr23_del') {
                counts = table(D[D$Gender == 'Female',][[gene]], D[D$Gender == 'Female',][[chrom]])
            }

            if (chrom == 'loy') {
                counts = table(D[D$Gender == 'Male',][[gene]], D[D$Gender == 'Male',][[chrom]])
            }
            
            co = counts[2,2]

            if (co >= 2) {

                test = fisher.test(counts)
                res = rbind(res,
                    data.frame(
                        gene = gene,
                        chrom = chrom,
                        p = test$p.value,
                        OR = test$estimate,
                        co = co
                    ))
            }
            
        }
        
    }
}

ntest_broad = length(chroms_broad) * length(genes)
ntest_specific = (length(chroms) - length(chroms_broad)) * length(genes)

res = res %>% 
    mutate(category = ifelse(str_detect(chrom, '_'), 'broad', 'specific')) %>%
    mutate(q = ifelse(
        category == 'broad', 
        p.adjust(p, method = 'BH', n = ntest_broad),
        p.adjust(p, method = 'BH', n = ntest_specific)
    )) %>% 
    mutate(chrom = str_remove(chrom, 'chr')) %>%
#     mutate(q = p.adjust(p, method = 'BH', n = ntest)) %>%
    mutate(logOR = log2(OR))
'DNMT3A'
'TET2'
'ASXL1'
'EZH2'
'TP53'
'ATM'
'CHEK2'
'PPM1D'
'SRSF2'
'SF3B1'
'U2AF1'
'MPL'
'JAK2'
'TERT'
In [81]:
ntest_broad
ntest_specific
154
378
In [82]:
limor = 10
width = 0.5
star_dodge = 0.3
key_width = 0.7

ddr = c('TP53', 'PPM1D', 'CHEK2', 'ATM')
splicing = c('SRSF2', 'SF3B1', 'U2AF1')
dna_methylation = c('TET2', 'DNMT3A')
chromatin = c('ASXL1', 'EZH2')

genes = unique(res$gene)

gene_locations = gene_bed %>% filter(Gene %in% genes)

res = res %>% 
    mutate(signif_cat = case_when(
        q < 0.01 ~ 'q<0.01',
        q < 0.05 ~ 'q<0.05',
        q < 0.1 ~ 'q<0.1',
        T ~ 'ns'
    ))

D = res %>% 
    rowwise() %>%
    mutate(
        type = str_split(chrom, '_')[[1]][2],
        chrom = str_split(chrom, '_')[[1]][1]
    ) %>%
    mutate(type = ifelse(is.na(type), 'any', type)) %>%
    mutate(type = ifelse(chrom == 'loy', 'del', type)) %>%
    ungroup() %>%
    mutate(chrom = ifelse(chrom == 23, 'X', as.character(chrom))) %>%
    mutate(
        chrom = factor(chrom, gtools::mixedsort(unique(chrom))),
        gene = factor(gene, rev(genes))
    ) %>%
    mutate(
        chrom_i = as.integer(chrom),
        gene_i = as.integer(gene)
    )

D = D %>% mutate(gene_function = case_when(
    gene %in% dna_methylation ~ 'DNA methylation',
    gene %in% splicing ~ 'Splicing',
    gene %in% chromatin ~ 'Chromatin',
    gene %in% ddr ~ 'DDR',
    T ~ 'Other'
))

p_main = D %>%
    rbind(
        mutate(., 
            chrom_i = case_when(
                type == 'amp' ~ chrom_i - width,
                type == 'loh' ~ chrom_i - width,
                type == 'del' ~ chrom_i + width,
                type == 'any' ~ chrom_i - width,
            ),
            gene_i = case_when(
                type == 'amp' ~ gene_i + width,
                type == 'loh' ~ gene_i + width,
                type == 'del' ~ gene_i + width,
                type == 'any' ~ gene_i - width,
            )
        ),
        mutate(.,
            chrom_i = case_when(
                type == 'amp' ~ chrom_i - width,
                type == 'loh' ~ chrom_i + width,
                type == 'del' ~ chrom_i + width,
                type == 'any' ~ chrom_i + width,
            ), 
            gene_i = case_when(
                type == 'amp' ~ gene_i - width,
                type == 'loh' ~ gene_i + width,
                type == 'del' ~ gene_i - width,
                type == 'any' ~ gene_i - width,
            )
        )
    ) %>%
    rowwise() %>% mutate(OR = min(OR, limor), logOR = log2(OR)) %>% ungroup() %>%
    ggplot(
        aes()
    ) +
    theme_classic() +
    geom_polygon(
        aes(x = chrom_i, y = gene_i, group = paste0(gene, chrom, type), fill = OR),
        color = 'white',
        size = 0.5,
#         alpha = 1
    ) +
    geom_vline(data = D, aes(xintercept = chrom_i + 0.5), color = 'gray95') +
    geom_hline(data = D, aes(yintercept = gene_i + 0.5), color = 'gray95') +
    geom_point(
        aes(x = case_when(
                type == 'loh' ~ as.numeric(chrom_i),
                type == 'del' ~ as.numeric(chrom_i) + star_dodge,
                type == 'amp' ~ as.numeric(chrom_i) - star_dodge,
                type == 'any' ~ as.numeric(chrom_i)
            ),
            y = case_when(
                type == 'loh' ~ gene_i + star_dodge,
                type == 'del' ~ as.numeric(gene_i),
                type == 'amp' ~ as.numeric(gene_i),
                type == 'any' ~ as.numeric(gene_i) - star_dodge
            ),
            pch = signif_cat
        ),
        data = D %>% filter(q < 0.1),
        color = 'black',
        size = 1.3
    ) +
    geom_tile(
        data = gene_locations %>% 
            mutate(
                chrom_i = as.integer(factor(chrom, levels(D$chrom))),
                gene_i = as.integer(factor(Gene, levels(D$gene)))
            ),
        aes(x = chrom_i, y = gene_i, linetype = ' '),
        alpha = 0,
        color = 'black',
        size = 0.5
    ) +
    scale_fill_gradient2(low = 'blue', high = 'indianred2', mid = 'white', na.value = 'white', midpoint = 1, limits = c(0, limor)) +
#     scale_fill_manual(values = c(cnv_pal, 'any' = 'darkorange')) +
    ylab('') +
    xlab('') +
    scale_x_continuous(
        expand = expansion(add = 0.05),
        labels = levels(D$chrom),
        breaks = 1:length(levels(D$chrom)),
        position = "top"
    ) +
    scale_y_continuous(
        expand = expansion(add = 0.05), labels = levels(D$gene), breaks = 1:length(levels(D$gene))
    ) +
    theme(
        plot.margin = unit(c(0,0,0,0), 'mm'),
        axis.text.y = element_text(
            face = 'bold.italic',
            color = case_when(
                as.character(levels(D$gene)) %in% dna_methylation ~ 'royalblue',
                as.character(levels(D$gene)) %in% splicing ~ 'firebrick',
                as.character(levels(D$gene)) %in% chromatin ~ 'darkolivegreen3',
                as.character(levels(D$gene)) %in% ddr ~ 'goldenrod2',
                T ~ 'black'
            )
        ),
        axis.text.x = element_text(
            face = 'bold'
        ),
        legend.key.size = unit(0.3, 'cm'),
        legend.position = 'right',
        panel.border = element_rect(color = 'gray', fill = NA),
        axis.line = element_line(color = 'gray'),
        legend.text = element_text(size = 8),
        legend.title = element_text(size = 8),
        legend.spacing = unit(2, 'mm')
    ) +
    scale_shape_manual(values = c('q<0.01' = 8, 'q<0.05' = 3, 'q<0.1' = 20), name = 'FDR') +
    geom_text(
        data = D %>% distinct(gene, gene_function) %>% mutate(gene_function = relevel(factor(gene_function), 'Other')),
        aes(x = 1, 
            y = 1,
            color = gene_function,
            label = gene),
        size = 0,
        key_glyph = draw_key_rect
    ) +
#     scale_alpha_continuous(range = c(0, 0.5)) +
    scale_color_manual(
        values = c(
            'DNA methylation' = 'royalblue', 
            'Splicing' = 'firebrick', 
            'Chromatin' = 'darkolivegreen3',
            'DDR' = 'goldenrod2',
            'Other' = 'black'
        )
    ) +
    guides(
        color = guide_legend(keyheight = key_width, keywidth = key_width, title = 'Gene function', override.aes = list(size = 3)),
#         alpha = guide_legend(keyheight = key_width, keywidth = key_width),
#         fill = guide_legend(keyheight = key_width, keywidth = key_width, override.aes = list(alpha = 0.5)),
        linetype = guide_legend(title = 'Gene location', keyheight = key_width, keywidth = key_width)
    )

do_plot(p_main, 'trans_heatmap.pdf', w = 6.5, h = 4)
Warning message:
“Vectorized input to `element_text()` is not officially supported.
Results may be unexpected or may change in future versions of ggplot2.”
Warning message:
“Removed 1 rows containing missing values (geom_tile).”
Warning message:
“Removed 1 rows containing missing values (geom_tile).”
In [21]:
segs_filtered_auto = segs_filtered_auto %>% mutate(
        trans_effect = 
            (chrom == 3 & type == 'amp' & TP53) |
            (chrom == 4 & type == 'del' & (!TET2) & (SRSF2 | ATM | CHEK2 | ASXL1)) |
            (chrom == 5 & type == 'del' & TP53) |
            (chrom == 7 & type == 'del' & (TP53 | PPM1D)) |
            (chrom == 8 & type == 'amp' & TET2) |
            (chrom == 12 & type == 'amp' & (NOTCH1 | MYD88 | FBXW7)) |
            (chrom == 13 & type == 'del' & ATM) |
            (chrom == 20 & type == 'del' & (U2AF1 | TERT))
    )

trans_total = sum(segs_filtered_auto$trans_effect)
trans_del = sum(segs_filtered_auto %>% filter(type == 'del') %>% pull(trans_effect))
trans_amp = sum(segs_filtered_auto %>% filter(type == 'amp') %>% pull(trans_effect))

trans_total %>% paste0(' total trans events')
trans_del %>% paste0(' total trans deletions')
trans_amp %>% paste0(' total trans amps')
(trans_total/nrow(segs_filtered_auto)) %>% signif(3) %>% {.*100} %>% paste0('% of total events')
(trans_del/sum(segs_filtered_auto$type == 'del')) %>% signif(3) %>% {.*100} %>% paste0('% of total deletions')
(trans_amp/sum(segs_filtered_auto$type == 'amp')) %>% signif(3) %>% {.*100} %>% paste0('% of total amps')
'32 total trans events'
'21 total trans deletions'
'11 total trans amps'
'9.55% of total events'
'13.5% of total deletions'
'19.6% of total amps'

Genomic distribution

In [12]:
acen = fread('/home/gaot/ref/acen.tsv')
colnames(acen) = c('chrom', 'start', 'end', 'arm', 'type')
chrom_lens = fread('/home/gaot/ref/chrom_lens.tsv') %>% filter(chrom != 24)
chrom_levels = c(as.character(1:22), 'X')

acen = acen %>% 
    mutate(chrom = str_remove(chrom, 'chr')) %>%
    filter(chrom != 'Y') %>%
    mutate(
        chrom = ifelse(chrom == 'X', 23, chrom),
        chrom = as.integer(chrom)
    ) %>%
    arrange(chrom) %>%
    group_by(chrom) %>%
    summarise(cen_start = min(start), cen_end = max(end)) %>%
    ungroup()

cytoband = fread('./cytoBand.txt') %>% setNames(c('chrom', 'start', 'end', 'name', 'stain'))

cytoband = cytoband %>% 
    mutate(chrom = str_remove(chrom, 'chr')) %>%
    mutate(chrom = ifelse(chrom == 'X', 23, chrom)) %>%
    filter(chrom != 'Y') %>%
    mutate(chrom = as.integer(chrom))

cyto_colors = c(
    'gpos100'= rgb(0/255.0,0/255.0,0/255.0),
    'gpos'   = rgb(0/255.0,0/255.0,0/255.0),
    'gpos75' = rgb(130/255.0,130/255.0,130/255.0),
    'gpos66' = rgb(160/255.0,160/255.0,160/255.0),
    'gpos50' = rgb(200/255.0,200/255.0,200/255.0),
    'gpos33' = rgb(210/255.0,210/255.0,210/255.0),
    'gpos25' = rgb(200/255.0,200/255.0,200/255.0),
    'gvar'   = rgb(220/255.0,220/255.0,220/255.0),
    'gneg'  = rgb(255/255.0,255/255.0,255/255.0),
    'acen'  = rgb(217/255.0,47/255.0,39/255.0),
    'stalk' = rgb(100/255.0,127/255.0,164/255.0)
)

window = 5e6

M_overlap = M_long %>%
    filter(dmp_patient_id %in% segs_filtered$dmp_patient_id) %>%
    rowwise() %>%
    filter(
        any(chrom == segs_filtered$chrom & dmp_patient_id == segs_filtered$dmp_patient_id &
            start >= segs_filtered$start - window & start <= segs_filtered$end + window)
    ) %>%
    ungroup() %>%
    mutate(VariantClass = case_when(
        VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
        T ~ 'coding'
    ))

p_segs = segs_filtered %>%
    mutate(chrom = factor(ifelse(chrom == 23, 'X', as.character(chrom)), chrom_levels)) %>%
    group_by(dmp_sample_id, chrom) %>%
    mutate(total_length = sum(size)) %>%
    mutate(type2 = factor(type2, c('AMP', 'CNLOH', 'DEL'))) %>%
    arrange(type2, round(start/1e7), -total_length) %>%
    group_by(chrom) %>%
    mutate(index = as.integer(factor(dmp_sample_id, unique(dmp_sample_id)))) %>% 
#     left_join(
#         M_overlap %>% select(dmp_patient_id, chrom, sm_pos = start, Gene, VariantClass, VAF_N),
#         by = c('chrom', 'dmp_patient_id')
#     ) %>%
    ungroup() %>%
    ggplot(
        aes(x = start, xend = end, y = index, yend = index, color = type2)
    ) +
    geom_rect(
        inherit.aes = F,
        data = chrom_lens %>% mutate(chrom = factor(ifelse(chrom == 23, 'X', as.character(chrom)), chrom_levels)),
        aes(xmin = 0, xmax = length, ymin = -2, ymax = -0.5),
        color = 'black',
        fill = 'white',
        size = 1
    ) +
    geom_rect(
        data = cytoband %>% mutate(chrom = factor(ifelse(chrom == 23, 'X', as.character(chrom)), chrom_levels)),
        inherit.aes = F,
        aes(xmin = start, xmax = end, ymin = -2, ymax = -0.5, fill = stain),
        size = 1,
        show.legend = FALSE
    ) +
    geom_segment(
        lineend = 'round',
        size = 1
    ) +
    scale_fill_manual(values = cyto_colors) +
    theme_void() +
    theme(
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        legend.position = 'right',
        panel.spacing.y = unit(0, "lines"),
        plot.margin = unit(c(0,0,0,0), "lines"),
        strip.text.y = element_text(size = 10, hjust = 0.5, angle = 0),
        panel.border = element_blank(),
        strip.background = element_blank()
    ) +
    scale_y_discrete(expand = expansion(add = 1)) +
    scale_x_discrete(expand = c(0.02,0)) +
    facet_grid(.~chrom, space = 'free', scales = 'free', switch="both") +
    xlab('') +
    ylab('') +
    scale_color_manual(values = cnv_pal2) +
    labs(color = '')

do_plot(p_segs, 'genome_wide.pdf', w = 20, h = 2.2)
`summarise()` ungrouping output (override with `.groups` argument)

In [59]:
cnv_pal2
CNLOH
'darkgreen'
AMP
'darkblue'
DEL
'darkred'
In [1007]:
impact_bed = fread('/work/isabl/data/techniques/51/bed_files/GRCh37/dna-td-impact-468-grch37.targets.bed.gz')
In [1014]:
colnames(impact_bed) = c('chrom', 'start', 'end', 'strand', 'target')
impact_bed = impact_bed %>% filter(chrom != 'Y')
impact_bed = impact_bed %>% mutate(chrom = as.integer(ifelse(chrom == 'X', 23, chrom)))

CDRs

In [119]:
focal_dels = function(gene) {
    
    all_colors = c(
        'amp' = 'darkblue', 'loh' = 'darkgreen', 'del' = 'darkred',
        'Regulatory' = 'darkturquoise', 'Missense' = 'salmon', 'Truncating' = 'purple') 
    
    gene_structure = gene_structures %>% filter(str_detect(description, paste0('"', gene, '"')))

    gene_loc = gene_bed %>% 
        filter(Gene == gene) %>%
        as.data.frame(stringsAsFactors = F) %>%
        mutate(loc = as.integer((start + end) / 2), chrom = as.integer(chrom))

    this_chrom_lens = chrom_lens %>% filter(chrom == gene_loc$chrom)

    acen_chrom = acen %>% filter(chrom == gene_loc$chrom)

    D = segs_filtered %>%
        filter(type == 'del' & start < gene_loc$start + 5e6 & end > gene_loc$end - 5e6) %>%
        filter(chrom == gene_loc$chrom) %>%
        mutate(chrom = factor(chrom)) %>%
        group_by(dmp_sample_id, chrom) %>%
        mutate(total_length = sum(size)) %>%
        mutate(type = factor(type, c('amp', 'loh', 'del'))) %>%
        arrange(desc(type), round(start/1e7), -total_length) %>%
        group_by(chrom) %>%
        mutate(index = as.integer(factor(dmp_sample_id, unique(dmp_sample_id)))) %>%
        ungroup()

    p_segs = ggplot(
            D,
            aes(x = start, xend = end, y = index, yend = index, color = type)
        ) +
        geom_rect(
            inherit.aes = F,
            data = acen_chrom,
            aes(xmin = cen_start, xmax = cen_end, ymin = -Inf, ymax = Inf),
            color = 'gray',
            size = 0,
            alpha = 0.3
        ) +
        geom_segment(
            size = 1,
            alpha = 0.8,
            lineend = 'round'
        ) +
        geom_segment(
            inherit.aes = F,
            data = this_chrom_lens,
            aes(x = 0, xend = length, y = 0, yend = 0),
            alpha = 0,
            size = 0,
            color = 'white'
        ) +
        scale_alpha(
            range = c(0, 1)
        ) +
        theme_classic() +
        theme(
            axis.text = element_blank(),
            axis.ticks = element_blank(),
            axis.title = element_blank(),
            legend.position = 'none',
            panel.spacing.y = unit(1, "mm"),
            plot.margin = unit(c(0.5,0.2,0.2,0.2), "lines"),
            strip.text.y = element_blank(),
            panel.border = element_rect(size = 0.2, fill = NA, color = 'gray'),
            strip.background = element_rect(size = 0.5, fill = 'seashell3', color = 'seashell3'),
            plot.title = element_text(size = 7)
        ) +
        scale_y_discrete(expand = expansion(add = 1)) +
        xlab('') +
        ylab('') +
        scale_color_manual(values = all_colors) +
        ggtitle(paste0(gene, ', ', 'chr', gene_loc$chrom)) +
        # exon structures
        geom_segment(
            inherit.aes = F,
            data = gene_structure %>% mutate(zoom = TRUE),
            aes(x = min(start), xend = max(end), y = -1, yend = -1),
            size = 0.1,
            color = 'black'
        ) +
        geom_segment(
            inherit.aes = F,
            data = gene_structure %>% mutate(zoom = TRUE),
            aes(x = start, xend = end, y = -1, yend = -1),
            size = 2,
            color = 'black'
        ) +
        facet_zoom(
            zoom.size = 1,
            zoom.data = zoom,
            xlim = c(gene_loc$start, gene_loc$end)
        )

    p_segs
}
In [120]:
focal_genes = c('DNMT3A', 'TET2', 'ATM', 'ETV6', 'NF1', 'CHEK2')

plots = list()

for (gene in focal_genes) {
    plots[[gene]] = focal_dels(gene)
}

p_focal = plot_grid(plotlist = plots, nrow = 2)

do_plot(p_focal, 'focal_del.pdf', w = 4, h = 4)

Co-localization

In [15]:
window = 5e6

segs_filtered_auto = segs_filtered %>% filter(chrom != 23)

M_overlap = M_long %>%
    filter(dmp_patient_id %in% segs_filtered_auto$dmp_patient_id) %>%
    rowwise() %>%
    mutate(
        locality = ifelse(
            any(chrom == segs_filtered_auto$chrom & dmp_patient_id == segs_filtered_auto$dmp_patient_id &
                start >= segs_filtered_auto$start - window & start <= segs_filtered_auto$end + window
            ),
            'cis',
            'trans'
        )
    ) %>%
    ungroup() %>%
    mutate(variant_type = case_when(
        VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
        T ~ 'coding'
    ))

perm = fread('./data/permutation.tsv', sep = '\t')

# perm = data.frame()

# for (i in 1:2000) {
    
#     if (i %% 500 == 0) {
#         display(i)
#     }
    
#     res = M_overlap %>%
#         mutate(dmp_patient_id = sample(dmp_patient_id)) %>%
#         rowwise() %>%
#         mutate(
#             locality = ifelse(
#                 any(chrom == segs_filtered_auto$chrom & dmp_patient_id == segs_filtered_auto$dmp_patient_id &
#                     start >= segs_filtered_auto$start - window & start <= segs_filtered_auto$end + window
#                 ),
#                 'cis',
#                 'trans'
#             )
#         ) %>%
#         ungroup() %>%
#         group_by(variant_type) %>%
#         summarise(
#             prop_cis = sum(locality == 'cis')/n(),
#             .groups = 'drop'
#         ) %>%
#         mutate(i = i)
    
#     perm = perm %>% rbind(res)
# }
In [89]:
prop_cis_coding = M_overlap %>% filter(variant_type == 'coding') %>% pull(locality) %>% {sum(. == 'cis')/length(.)}
prop_cis_noncoding = M_overlap %>% filter(variant_type == 'noncoding') %>% pull(locality) %>% {sum(. == 'cis')/length(.)}
prop_coding_nonrecurrent = M_overlap %>%
    filter(variant_type == 'coding') %>%
    filter(!Gene %in% c('MPL', 'DNMT3A', 'TET2', 'EZH2', 'JAK2', 'ATM', 'TP53')) %>% 
    pull(locality) %>% {sum(. == 'cis')/length(.)}
    
prop_cis_coding %>% paste('cis coding')
prop_coding_nonrecurrent %>% paste('cis coding non recurrent')
prop_cis_noncoding %>% paste('cis noncoding')
'0.150793650793651 cis coding'
'0.0380952380952381 cis coding non recurrent'
'0.0214285714285714 cis noncoding'
In [189]:
p_coding = perm %>% filter(variant_type == 'coding') %>% pull(prop_cis) %>% {sum(. > prop_cis_coding)/length(.)}
p_noncoding = perm %>% filter(variant_type == 'noncoding') %>% pull(prop_cis) %>% {sum(. > prop_cis_noncoding)/length(.)}

D = M_overlap %>% 
    group_by(variant_type) %>%
    summarise(
        prop_cis_median = sum(locality == 'cis')/n()
    ) %>%
    mutate(
        prop_cis_lower = prop_cis_median, 
        prop_cis_upper = prop_cis_median,
        estimate = 'observed'
    ) %>%
    rbind(
        perm %>% 
        group_by(variant_type) %>%
        summarise(
            prop_cis_lower = quantile(prop_cis, c(.05))[1],
            prop_cis_upper = quantile(prop_cis, c(.95))[1],
            prop_cis_median = median(prop_cis)
        ) %>%
        mutate(
            estimate = 'expected'
        )
    ) %>%
    mutate(
        variant_type = Hmisc::capitalize(variant_type)
    )


p_perm = ggplot(
        D,
        aes(x = variant_type, y = prop_cis_median, ymin = prop_cis_lower, ymax = prop_cis_upper, pch = estimate)
    ) +
    geom_pointrange() +
    theme_classic() +
    scale_shape_manual(values=c(16, 5)) +
    theme(legend.position = 'right') +
    ylab('Proportion of cis locality') +
    ylim(0, 0.18) +
    xlab('') +
    # stat_compare_means(
    #     comparisons = list(c('coding', 'noncoding')),
    #     label.y = 0.14,
    #     size = 0
    # )
    annotate('text', x = 1, y = 0.18, label = glue('p<0.001'), size = 3) +
    annotate('text', x = 2, y = 0.18, label = glue('p={signif(p_noncoding, 2)}'), size = 3) +
    theme(legend.title = element_blank())

do_plot(p_perm, 'permutation.pdf', w = 3, h = 2)
`summarise()` ungrouping output (override with `.groups` argument)

`summarise()` ungrouping output (override with `.groups` argument)

general characterstics

In [90]:
options(repr.plot.width = 4, repr.plot.height = 2.5, repr.plot.res = 300)

p_dev = ggplot(
  segs_filtered,
  aes(x = valor, y = cnlr_adj, color = type)
) +
scale_color_manual(values = cnv_pal) +
# geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.8, color = 'darkgreen') +
geom_point(pch = 21, stroke = 0.5) +
scale_x_continuous(limits = c(0, 2.5), expand = c(0,0)) +
scale_y_continuous(limits = c(-1, 0.6), expand = c(0,0)) +
theme_classic() +
theme(
    legend.position = 'none',
    axis.title = element_text(size = 8)
) +
# geom_text(size = 2) +
geom_line(
  inherit.aes = F,
  data = del_line,
  aes(x = valor, y = cnlr),
  linetype = 'dashed', alpha = 0.8, color = 'darkred'
) +
geom_line(
  inherit.aes = F,
  data = amp_line,
  aes(x = valor, y = cnlr),
  linetype = 'dashed', alpha = 0.8, color = 'darkblue'
) +
geom_line(
  inherit.aes = F,
  data = loh_line,
  aes(x = valor, y = cnlr),
  linetype = 'dashed', alpha = 0.8, color = 'darkgreen'
) +
xlab('logOR') +
ylab('logR') + 
annotate('text', x = 0.9, y = 0.5, label = 'AMP', color = cnv_pal['amp'], size = 3) +
annotate('text', x = 2.1, y = 0.2, label = 'CNLOH', color = cnv_pal['loh'], size = 3) +
annotate('text', x = 2, y = -0.5, label = 'DEL', color = cnv_pal['del'], size = 3)
# annotate('text', x = 0.2, y = -0.8, label = paste0('n=', nrow(segs_filtered)), color = 'gray30', size = 3)

p_dev
Warning message:
“Removed 6 row(s) containing missing values (geom_path).”
In [91]:
options(repr.plot.width = 5, repr.plot.height = 2, repr.plot.res = 300)

p_age = ggplot(
    M_wide %>% 
        mutate(age_bin = cut(age, c(0, seq(40, 80, 10), 100))) %>%
        group_by(age_bin) %>%
        summarise(
            prop_ch_cnv = sum(ch_cnv)/n(),
            prop_ch_cnv_auto = sum(ch_cnv_auto)/n(),
            prop_both_ch = sum(ch_cnv & ch_nonsilent)/n(),
            n = n()
        ),
    aes(x = age_bin, y = prop_ch_cnv, group = '1')
) +
geom_line() +
theme_classic() +
geom_point() +
ylim(0, 0.04) +
theme(
    axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1, size = 8),
    axis.title = element_text(size = 8)
) +
ylab('Proportion with mCA')

p_age
`summarise()` ungrouping output (override with `.groups` argument)

In [92]:
options(repr.plot.width = 3, repr.plot.height = 3, repr.plot.res = 300)

give.min <- function(x){
  return(c(y = min(x), label = min(x))) 
}

p_phi = ggplot(
        segs_filtered,
        aes(x = type2, y = phi, fill = type2, color = type2)
    ) +
#     geom_hline(yintercept = 0.1, linetype = 'dashed', color = 'gray', size = 0.5, alpha = 0.8) +
    geom_violin(size = 0.3, position = position_dodge(width = 0.75), alpha = 0.2, scale = 'width') +
    theme_classic() +
    geom_quasirandom(width = 0.1, method = 'smiley', size = 0.5) +
    scale_fill_manual(values = cnv_pal2) +
    scale_color_manual(values = cnv_pal2) +
    ylim(0,NA) +
    theme(
        legend.position = 'none',
        axis.title = element_text(size = 8),
        axis.text.x = element_text(size = 8, angle = 30, hjust = 1, vjust = 1)
    ) +
    ylab('Aberrant cell fraction') +
    xlab('Event type') +
    stat_summary(fun.data = give.min, geom = "text", vjust = 1.5, size = 2.5)

p_phi
In [93]:
options(repr.plot.width = 1.5, repr.plot.height = 2, repr.plot.res = 300)

p_n = ggplot(
        segs_filtered %>% group_by(dmp_sample_id) %>% mutate(n_sample = n()) %>%
        ungroup %>% count(n_sample, active_heme, type) %>% mutate(n = n/n_sample) %>%
            mutate(n_sample = case_when(
                n_sample < 3 ~ as.character(n_sample), T ~ '3+')),
        aes(x = n_sample, y = n, fill = type)
    ) +
    theme_classic() +
    geom_bar(stat = 'identity', position = 'stack') +
    scale_fill_manual(values = cnv_pal) +
    theme(
        legend.position = 'none',
        axis.title = element_text(size = 8)
    ) +
    xlab('Events per sample')

p_n
In [94]:
panel = p_age + p_n + p_dev + p_phi + plot_layout(widths = c(1.8,1))

do_plot(panel, 'ai_general.pdf', w = 3.8, h = 3.3)
Warning message:
“Removed 6 row(s) containing missing values (geom_path).”
Warning message:
“Removed 6 row(s) containing missing values (geom_path).”
In [226]:
options(repr.plot.width = 5, repr.plot.height = 2, repr.plot.res = 300)

pval = 0.01
label_size = 8

p_mutnum = ggplot(
    M_wide %>% 
        mutate(mutnum_silent = mutnum_silent + mutnum_intronic) %>%
        mutate(ch_cnv = ifelse(ch_cnv == 1, 'mCA+', 'mCA-')) %>%
        reshape2::melt(
            measure.vars = c('mutnum_all', 'mutnum_nonsilent', 'mutnum_silent'),
            variable.name = 'mutation_type',
            value.name = 'mutnum'
        ) %>%
        filter(mutation_type == 'mutnum_all') %>%
        mutate(mutnum = ifelse(mutnum >= 3, '3+', as.character(mutnum))) %>%
        count(ch_cnv, mutnum, mutation_type) %>%
        group_by(ch_cnv) %>%
        mutate(prop = n/sum(n)) %>%
        ungroup() %>%
        filter(mutnum > 0),
    aes(x = ch_cnv, y = prop, fill = mutnum)
) +
geom_col(color = 'black', size = 0.2) +
theme_classic() +
scale_fill_manual(values = scales::seq_gradient_pal("white", sm_col)(c(0.3, 0.6, 1))) +
scale_y_continuous(expand = expansion(add = 0.05), limits = c(0,0.85)) +
stat_compare_means(
    comparisons = list(c('mCA+', 'mCA-')),
    label.y = 0.8,
    size = 0,
    method = 't.test'
) +
annotate('text', x = 1.5, y = 0.8, label = '****', size = 3) +
xlab('') +
theme(
    legend.key.size = unit(3, 'mm'),
    legend.title = element_text(size = 7),
    legend.position = 'top',
    legend.spacing = unit(0,'mm'),
    axis.text.x = element_text(size = 8, angle = 30, hjust = 1)
) +
guides(fill = guide_legend(title.position = "top"))

p_vaf = ggplot(
        M_wide %>% 
            rowwise() %>% mutate(VAF_silent = max(VAF_silent, VAF_intronic)) %>% ungroup() %>%
            mutate(ch_cnv = ifelse(ch_cnv == 1, 'mCA+', 'mCA-')) %>%
            reshape2::melt(
                measure.vars = c('VAF_all', 'VAF_nonsilent', 'VAF_silent'),
                variable.name = 'mutation_type',
                value.name = 'VAF'
            ) %>% 
            filter(VAF > 0 & VAF < 0.5 & mutation_type == 'VAF_all'),
        aes(x = ch_cnv, y = VAF, fill = ch_cnv)
    ) +
    geom_violin(draw_quantiles = c(0.5)) +
    theme_classic() +
    stat_compare_means(
        size = 2.5,
        method = 't.test', 
        label = 'p.signif', 
        comparisons = list(c('mCA+', 'mCA-'))) +
    ylim(0,0.55) +
    theme(
        axis.text.x = element_text(size = 8, angle = 30, hjust = 1),
        legend.position = 'none'
    ) +
    xlab('') +
    scale_fill_manual(values = c(cnv_col, 'gainsboro'))

D = M_wide %>%
    mutate(ch_status = case_when(
        ch_cnv & ch_nonsilent ~ 'Both',
        ch_cnv == 1 ~ 'mCA only',
        ch_nonsilent == 1 ~ 'GM only',
        T ~ 'none'
    )) %>%
    group_by(ch_status) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    filter(ch_status != 'none')


give.n <- function(x){
  return(c(y = min(x), label = length(x))) 
}

p_age = ggplot(
        D,
        aes(x = ch_status, y = age, fill = ch_status)
    ) +
#     geom_violin(draw_quantiles = c(0.5)) +
    geom_boxplot(outlier.alpha = 0) +
    theme_classic() +
    stat_compare_means(
        comparisons = list(
            c('mCA only', 'GM only'),
            c('Both', 'GM only'),
            c('mCA only', 'Both')
        ),
        size = 2.2,
        method = 't.test', label = 'p.signif',
        vjust = 0,
        step.increase = 0.15
    ) +
    ylim(NA, 130) +
    theme(
#         plot.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
#         axis.title.x = element_blank()
        axis.text.x = element_text(size = 8, angle = 30, hjust = 1),
        legend.position = 'none'
    ) +
    xlab('') +
    scale_fill_manual(values = c('mCA only' = cnv_col, 'Both' = 'darkorange', 'GM only' = sm_col)) +
    ylab('Age')

# p_phi = ggplot(
#         segs_filtered %>% 
#             mutate(ch_nonsilent = ifelse(ch_nonsilent == 1, 'GM+', 'GM-')),
#         aes(x = ch_nonsilent, y = phi, fill = ch_nonsilent)
#     ) +
#     geom_violin(
#         draw_quantiles = c(0.5),
#         scale = 'width',
#         trim = T
#     ) +
#     theme_classic() +
#     stat_compare_means(
#         size = 2.5,
#         method = 't.test',
#         label = 'p.value',
#         label.x = 1.5,
#         comparisons = list(c('GM+', 'GM-'))
#     ) +
#     ylim(0,1) +
#     xlab('') +
#     ylab('Cell fraction') +
#     theme(
#         axis.text.x = element_text(size = 8, angle = 30, hjust = 1),
#         legend.position = 'none'
#     ) +
#     xlab('') +
#     scale_fill_manual(values = c(sm_col, 'gainsboro'))

do_plot(p_age, 'age.pdf', w = 1.5, h = 1.8)
In [96]:
options(repr.plot.width = 6, repr.plot.height = 2, repr.plot.res = 300)

ggplot(
    segs_filtered %>% 
        mutate(age_bin = cut(age, c(20, seq(40, 80, 10), Inf))) %>%
        group_by(age_bin, type) %>%
        summarise(
            ci_lower = mean(phi) - sd(phi),
            ci_upper = mean(phi) + sd(phi),
            phi = mean(phi),
            n = n()
        ),
    aes(x = age_bin, y = phi, ymin = ci_lower, ymax = ci_upper, group = type, color = type, fill = type)
) +
geom_line() +
theme_classic() +
geom_point() +
# geom_errorbar(size = 0.5, width = 0.1) +
geom_ribbon(alpha = 0.2, size = 0) +
scale_color_manual(values = c('loh' = 'darkgreen', 'amp' = 'darkblue', 'del' = 'darkred', 'err' = 'gray')) +
scale_fill_manual(values = c('loh' = 'darkgreen', 'amp' = 'darkblue', 'del' = 'darkred', 'err' = 'gray')) +
facet_wrap(~type) +
theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5),
    strip.background = element_blank()
)
`summarise()` regrouping output by 'age_bin' (override with `.groups` argument)

Comparison with other studies

In [8]:
segs_laurie = fread('./data/external/segs_laurie.txt')

segs_laurie = segs_laurie %>% 
    mutate(
        size = length * 1e6,
        type = c('aupd' = 'loh', 'gain' = 'amp', 'loss' = 'del')[mosaic.type],
        start = `Start (GRCh37)`,
        end = `End (GRCh37)`,
        chrom = chromosome,
        patient_id = as.character(subject.id)
    ) %>%
    mutate(n_total = 50222)

segs_loh = fread('./data/external/segs_loh.txt') %>%
    filter(COPY_CHANGE != 'undetermined') %>%
    mutate(
        chrom = as.integer(ifelse(CHR == 'X', 23, CHR)),
        size = SIZE_MB * 1e6,
        start = `Start (GRCh37)`,
        end = `End (GRCh37)`,
        type = c('neutral' = 'loh', 'loss' = 'del', 'gain' = 'amp')[COPY_CHANGE],
        phi = as.numeric(CELL_FRAC),
        patient_id = as.character(ID)
    ) %>%
    mutate(n_total = 151202)

segs_jacobs = fread('./data/external/segs_jacobs.txt') %>%
    mutate(
        chrom = CHROM,
        start = `Start (GRCh37)`,
        end = `End (GRCh37)`,
        size = SEG_SIZE,
        type = c('NEUTRAL' = 'loh', 'LOSS' = 'del', 'GAIN' = 'amp')[STATE],
        phi = PROPORTION_ABNORMAL,
        patient_id = as.character(ID)
    ) %>%
    mutate(n_total = 31717 + 26136)

segs_machiela = fread('./data/external/segs_machiela.txt') %>%
    mutate(
        chrom = as.integer(CHR),
        start = `Start (GRCh37)`,
        end = `End (GRCh37)`,
        size = SEG_SIZE,
        type = c('Neutral' = 'loh', 'Loss' = 'del', 'Gain' = 'amp')[STATE],
        phi = PERCENT,
        age = as.numeric(AGE),
        patient_id = as.character(ID)
    ) %>%
    select(-AGE, -CHR) %>%
    filter(!is.na(type)) %>%
    mutate(n_total = 24849)
    
segs_all = bind_rows(
        segs_filtered %>% mutate(study = 'This study') %>%
            mutate(n_total = 32444) %>%
            mutate(patient_id = dmp_patient_id),
        segs_loh %>% mutate(study = 'Loh'),
        segs_laurie %>% mutate(study = 'Laurie'),
        segs_jacobs %>% mutate(study = 'Jacobs'),
        segs_machiela %>% mutate(study = 'Machiela')
    ) %>% 
    mutate(study_label = paste0(study, '\n(n=', n_total, ')')) %>%
    mutate(study_label = factor(study_label, unique(study_label))) %>%
    mutate(study = factor(study, unique(study))) %>%
    mutate(type2 = factor(c('del' = 'DEL', 'loh' = 'CNLOH', 'amp' = 'AMP')[type], c('DEL', 'CNLOH', 'AMP')))
In [9]:
study_info = data.frame(
    'Study' = c('This study', 'Loh et al.', 'Laurie et al.', 'Jacobs et al.', 'Machiela et al. (TSGII)', 'Jaiswal et al.', 'Genovese et al.', 'Zink et al.', 'Terao et al.', 'Loh et al.'),
    'Year' = c(2020, 2018, 2012, 2012, 2015, 2014, 2014, 2017, 2020, 2020),
    'Genotyping Platform' = c('Targeted Sequencing', 'SNP-array', 'SNP-array', 'SNP-array', 'SNP-array', 'WES', 'WES', 'WGS', 'SNP-array', 'SNP-array/WES(partial)'),
    'Mutation Type' = c('mCA/GM(468 genes)', 'mCA', 'mCA', 'mCA', 'mCA', 'GM', 'GM', 'GM', 'mCA', 'mCA/GM(3 genes)'),
    'Sample Size' = c('32,442', '151,202', '50,222', '57,853', '24,849', '17,182', '12,380', '11,262', '179,417', '482,789/49,960'),
    'Cohort Type' = c('Cancer', 'Normal', 'Normal', 'Cancer/Normal', 'Cancer/Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal'),
    'Haplotype Info' = c('No', 'Yes', 'No', 'No', 'No', '-', '-', '-', 'Yes', 'Yes'),
    check.names = FALSE
) %>%
arrange(Year, Study == 'This study')

kb = study_info %>%
    kable(format = "html", booktabs = T) %>%
    row_spec(0:nrow(study_info), background = 'white', align = 'center', extra_css = "border-bottom: 1px solid") %>%
    row_spec(0, background = 'gray', extra_css = "border-bottom: 2px solid") %>%
    kable_styling(position = 'center', full_width = FALSE)

kb %>% toString() %>%
display_html()

#save_kable(kb, './figures/studies.pdf')
Study Year Genotyping Platform Mutation Type Sample Size Cohort Type Haplotype Info
Laurie et al. 2012 SNP-array mCA 50,222 Normal No
Jacobs et al. 2012 SNP-array mCA 57,853 Cancer/Normal No
Jaiswal et al. 2014 WES GM 17,182 Normal -
Genovese et al. 2014 WES GM 12,380 Normal -
Machiela et al. (TSGII) 2015 SNP-array mCA 24,849 Cancer/Normal No
Zink et al. 2017 WGS GM 11,262 Normal -
Loh et al. 2018 SNP-array mCA 151,202 Normal Yes
Terao et al. 2020 SNP-array mCA 179,417 Normal Yes
Loh et al. 2020 SNP-array/WES(partial) mCA/GM(3 genes) 482,789/49,960 Normal Yes
This study 2020 Targeted Sequencing mCA/GM(468 genes) 32,442 Cancer No
In [10]:
give.n <- function(x){
  return(c(y = max(x), label = length(x))) 
}

p = segs_all %>%
    mutate(size = size/1e6) %>%
    filter(chrom != '23') %>%
    filter(phi >= 0.1 | is.na(phi)) %>%
    ggplot(
        aes(x = type2, y = size, fill = type2)
    ) +
    geom_violin(alpha = 0.8, trim = F, draw_quantiles = c(0.5), scale = 'width', width = 0.8) +
    theme_classic() +
    scale_fill_manual(values = cnv_pal2) +
    scale_color_manual(values = cnv_pal2) +
    scale_y_continuous(trans = 'log10', limits = c(0.05, NA)) +
    facet_wrap(~study, nrow = 1) +
    # stat_summary(fun.data = give.n, geom = "text", hjust = 1.5, vjust = -1, size = 2.5) +
    theme(panel.spacing = unit(1, 'mm'), legend.position = 'none') +
    ylab('size in Mb') +
    xlab('Event type')

do_plot(p, 'size.pdf', w = 8, h = 2.2)
Warning message:
“Removed 7 rows containing non-finite values (stat_ydensity).”
Warning message:
“Removed 234 rows containing missing values (geom_violin).”
Warning message:
“Removed 7 rows containing non-finite values (stat_ydensity).”
Warning message:
“Removed 234 rows containing missing values (geom_violin).”
In [99]:
give.min <- function(x){
  return(c(y = min(x), label = round(min(x), 2))) 
}

p = segs_all %>%
    filter(!is.na(phi)) %>%
    filter(study != 'Loh') %>%
    ggplot(
        aes(x = type2, y = phi, fill = type2, color = type2)
    ) +
    geom_violin(size = 0.3, position = position_dodge(width = 0.75), alpha = 0.2, scale = 'width') +
    theme_classic() +
    geom_quasirandom(width = 0.1, method = 'smiley', size = 0.5) +
    scale_fill_manual(values = cnv_pal2) +
    scale_color_manual(values = cnv_pal2) +
    scale_y_continuous(expand = expansion(add = 0.15)) +
    theme(
        axis.title = element_text(size = 8)
    ) +
    # scale_y_continuous()
    ylab('Aberrant cell fraction') +
    xlab('Event type') +
    stat_summary(fun.data = give.min, geom = "text", vjust = 1.5, size = 2.5) +
    facet_wrap(~study, nrow = 1) +
    theme(panel.spacing = unit(2, 'mm'), legend.position = 'none')

do_plot(p, 'cell_frac.pdf', w = 5, h = 2.2)
In [100]:
p = segs_all %>%
    filter(chrom != '23') %>%
    filter(phi >= 0.1 | is.na(phi)) %>%
    group_by(study, chrom, type2) %>%
    summarise(
        n = length(unique(patient_id)),
        prop = n/unique(n_total),
        .groups = 'drop'
    ) %>%
    tidyr::complete(chrom, type2, study, fill = list(n = 0, prop = 0)) %>% 
    ggplot(
        aes(x = type2, y = prop, fill = study)
    ) +
    scale_x_discrete(drop=FALSE, expand = expansion(add = 0.5)) +
    geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
    facet_wrap(~chrom, scale = 'free') +
    scale_y_continuous(labels = function(x) format(x, scientific = TRUE)) +
    theme_classic() +
    theme(
        axis.text.y = element_text(size = 6)
    ) +
    ylab('Proportion of subjects') +
    xlab('Event type')
                   
do_plot(p, 'comparison_incidence.pdf', w = 11, h = 6.5)
In [13]:
acen = fread('/home/gaot/ref/acen.tsv')
colnames(acen) = c('chrom', 'start', 'end', 'arm', 'type')
chrom_lens = fread('/home/gaot/ref/chrom_lens.tsv') %>% filter(chrom != 24)

acen = acen %>% 
    mutate(chrom = str_remove(chrom, 'chr')) %>%
    filter(chrom != 'Y') %>%
    mutate(
        chrom = ifelse(chrom == 'X', 23, chrom),
        chrom = as.integer(chrom)
    ) %>%
    arrange(chrom) %>%
    group_by(chrom) %>%
    summarise(cen_start = min(start), cen_end = max(end), .groups = 'drop') %>%
    ungroup()

cytoband = fread('./cytoBand.txt') %>% setNames(c('chrom', 'start', 'end', 'name', 'stain'))

cytoband = cytoband %>% 
    mutate(chrom = str_remove(chrom, 'chr')) %>%
    mutate(chrom = ifelse(chrom == 'X', 23, chrom)) %>%
    filter(chrom != 'Y') %>%
    mutate(chrom = as.integer(chrom))

cyto_colors = c(
    'gpos100'= rgb(0/255.0,0/255.0,0/255.0),
    'gpos'   = rgb(0/255.0,0/255.0,0/255.0),
    'gpos75' = rgb(130/255.0,130/255.0,130/255.0),
    'gpos66' = rgb(160/255.0,160/255.0,160/255.0),
    'gpos50' = rgb(200/255.0,200/255.0,200/255.0),
    'gpos33' = rgb(210/255.0,210/255.0,210/255.0),
    'gpos25' = rgb(200/255.0,200/255.0,200/255.0),
    'gvar'   = rgb(220/255.0,220/255.0,220/255.0),
    'gneg'  = rgb(255/255.0,255/255.0,255/255.0),
    'acen'  = rgb(217/255.0,47/255.0,39/255.0),
    'stalk' = rgb(100/255.0,127/255.0,164/255.0)
)

window = 1e7

M_overlap = M_long %>%
    filter(dmp_patient_id %in% segs_filtered$dmp_patient_id) %>%
    rowwise() %>%
    filter(
        any(chrom == segs_filtered$chrom & dmp_patient_id == segs_filtered$dmp_patient_id &
            start >= segs_filtered$start - window & start <= segs_filtered$end + window)
    ) %>%
    ungroup() %>%
    mutate(VariantClass = case_when(
        VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
        T ~ 'coding'
    ))

p_segs = segs_all %>%
#     arrange(study != 'Loh') %>%
    mutate(study = factor(study, unique(study))) %>%
    filter(phi >= 0.1 | is.na(phi)) %>%
    filter(chrom != 23) %>%
    group_by(patient_id, chrom) %>%
    mutate(total_length = sum(size)) %>%
    mutate(type2 = factor(type2, c('AMP', 'CNLOH', 'DEL'))) %>%
    arrange(type2, round(start/1e7), -total_length) %>%
    group_by(chrom, study) %>%
    mutate(index = as.integer(factor(patient_id, unique(patient_id)))) %>% 
    filter(index < 100) %>%
    ungroup() %>%
    ggplot(
        aes(x = start, xend = end, y = index, yend = index, color = type2)
    ) +
    geom_rect(
        inherit.aes = F,
        data = chrom_lens %>% filter(chrom != 23),
        aes(xmin = 0, xmax = length, ymin = -2, ymax = -0.5),
        color = 'black',
        fill = 'white',
        size = 1
    ) +
    geom_rect(
        data = cytoband %>% filter(chrom != 23),
        inherit.aes = F,
        aes(xmin = start, xmax = end, ymin = -2, ymax = -0.5, fill = stain),
        size = 1,
        show.legend = FALSE
    ) +
    geom_segment(
        lineend = 'round',
        size = 0.75
    ) +
    scale_fill_manual(values = cyto_colors) +
    theme_void() +
    theme(
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        legend.position = 'right',
        panel.spacing.y = unit(0.5, "lines"),
        plot.margin = unit(c(0,0,0,0), "lines"),
        panel.border = element_blank(),
        strip.text.y = element_text(size = 14, hjust = 0.5, angle = 0),
        strip.background.y = element_rect(size = 1, fill = NA),
        strip.background.x = element_blank()
    ) +
    scale_y_discrete(expand = expansion(add = 1)) +
    scale_x_discrete(expand = c(0.02,0)) +
    facet_grid(study~chrom, space = 'free', scales = 'free', switch="both") +
    xlab('') +
    ylab('') +
    scale_color_manual(values = cnv_pal2, name = '')

do_plot(p_segs, 'comparison_genomewide.pdf', w = 16, h = 10)
In [69]:
##  EDITED MS - Reviewer 2 
acen = fread('/home/gaot/ref/acen.tsv')
colnames(acen) = c('chrom', 'start', 'end', 'arm', 'type')
chrom_lens = fread('/home/gaot/ref/chrom_lens.tsv') %>% filter(chrom != 24)

acen = acen %>% 
    mutate(chrom = str_remove(chrom, 'chr')) %>%
    filter(chrom != 'Y') %>%
    mutate(
        chrom = ifelse(chrom == 'X', 23, chrom),
        chrom = as.integer(chrom)
    ) %>%
    arrange(chrom) %>%
    group_by(chrom) %>%
    summarise(cen_start = min(start), cen_end = max(end), .groups = 'drop') %>%
    ungroup()

cytoband = fread('./cytoBand.txt') %>% setNames(c('chrom', 'start', 'end', 'name', 'stain'))

cytoband = cytoband %>% 
    mutate(chrom = str_remove(chrom, 'chr')) %>%
    mutate(chrom = ifelse(chrom == 'X', 23, chrom)) %>%
    filter(chrom != 'Y') %>%
    mutate(chrom = as.integer(chrom))

cyto_colors = c(
    'gpos100'= rgb(0/255.0,0/255.0,0/255.0),
    'gpos'   = rgb(0/255.0,0/255.0,0/255.0),
    'gpos75' = rgb(130/255.0,130/255.0,130/255.0),
    'gpos66' = rgb(160/255.0,160/255.0,160/255.0),
    'gpos50' = rgb(200/255.0,200/255.0,200/255.0),
    'gpos33' = rgb(210/255.0,210/255.0,210/255.0),
    'gpos25' = rgb(200/255.0,200/255.0,200/255.0),
    'gvar'   = rgb(220/255.0,220/255.0,220/255.0),
    'gneg'  = rgb(255/255.0,255/255.0,255/255.0),
    'acen'  = rgb(217/255.0,47/255.0,39/255.0),
    'stalk' = rgb(100/255.0,127/255.0,164/255.0)
)

window = 1e7

M_overlap = M_long %>%
    filter(dmp_patient_id %in% segs_filtered$dmp_patient_id) %>%
    rowwise() %>%
    filter(
        any(chrom == segs_filtered$chrom & dmp_patient_id == segs_filtered$dmp_patient_id &
            start >= segs_filtered$start - window & start <= segs_filtered$end + window)
    ) %>%
    ungroup() %>%
    mutate(VariantClass = case_when(
        VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
        T ~ 'coding'
    ))

p_segs = segs_all %>%
filter(study %in% c("This study", "Loh")) %>%
#     arrange(study != 'Loh') %>%
    mutate(study = case_when(
        study == "Loh" & phi < 0.1 ~ "Loh <10%",
        study == "Loh" & phi >= 0.1 ~ "Loh >=10%",
        T ~ "This study")  ) %>%
    mutate(study = factor(study, unique(study))) %>%
    #filter(phi >= 0.1 | is.na(phi)) %>%
    filter(chrom != 23) %>%
    group_by(patient_id, chrom) %>%
    mutate(total_length = sum(size)) %>%
    mutate(type2 = factor(type2, c('AMP', 'CNLOH', 'DEL'))) %>%
    arrange(type2, round(start/1e7), -total_length) %>%
    group_by(chrom, study) %>%
    mutate(index = as.integer(factor(patient_id, unique(patient_id)))) %>% 
#     filter(index < 100) %>%
    ungroup() %>%
    ggplot(
        aes(x = start, xend = end, y = index, yend = index, color = type2)
    ) +
    geom_rect(
        inherit.aes = F,
        data = chrom_lens %>% filter(chrom != 23),
        aes(xmin = 0, xmax = length, ymin = -2, ymax = -0.5),
        color = 'black',
        fill = 'white',
        size = 1
    ) +
    geom_rect(
        data = cytoband %>% filter(chrom != 23),
        inherit.aes = F,
        aes(xmin = start, xmax = end, ymin = -2, ymax = -0.5, fill = stain),
        size = 1,
        show.legend = FALSE
    ) +
    geom_segment(
        lineend = 'round',
        size = 0.75
    ) +
    scale_fill_manual(values = cyto_colors) +
    theme_void() +
    theme(
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        legend.position = 'right',
        panel.spacing.y = unit(0.5, "lines"),
        plot.margin = unit(c(0,0,0,0), "lines"),
        panel.border = element_blank(),
        strip.text.y = element_text(size = 14, hjust = 0.5, angle = 0),
        strip.background.y = element_rect(size = 1, fill = NA),
        strip.background.x = element_blank()
    ) +
    scale_y_discrete(expand = expansion(add = 1)) +
    scale_x_discrete(expand = c(0.02,0)) +
    facet_grid(study~chrom, space = 'free', scales = 'free', switch="both") +
    xlab('') +
    ylab('') +
    scale_color_manual(values = cnv_pal2, name = '')

do_plot(p_segs, 'comparison_genomewide_MS_R2_10p.pdf', w = 16, h = 12)

Replicates

In [18]:
key = fread('/ifs/dmprequest/12-245/key.txt')[,1:2]
colnames(key) = c('dmp_sample_id', 'bam_name')
key = key %>% mutate(bam_dir = as.character(glue('/ifs/dmpshare/share/irb12_245/{bam_name}.bam')))
key = key %>% mutate(dmp_patient_id = substr(dmp_sample_id, 1, 9))
key = key %>% mutate(bait = substr(dmp_sample_id, nchar(dmp_sample_id) - 2, nchar(dmp_sample_id)))
# M_wide = M_wide %>% left_join(key %>% select(tumor_bam_dir = bam_dir, dmp_tumor_id = dmp_sample_id))

bam_replicates = key %>% 
    filter(str_detect(dmp_sample_id, 'IM'))  %>%
    filter(str_detect(dmp_sample_id, '-N'))  %>%
    group_by(dmp_patient_id) %>% 
    filter(n()>1) %>%
    ungroup()

sample_replicates = fread('./data/replicate_samples_parsed.txt')

sample_replicates = sample_replicates %>% 
    filter(QC == 'Passed' & ExemplarSampleType == 'Blood') %>% 
    mutate(DateofProcedureRep = as.Date(Date_of_Procedure, format = '%m/%d/%Y')) %>%
    rename(rep_sample_id = dmp_sample_id) %>%
    inner_join(M_wide %>% select(dmp_sample_id, dmp_patient_id, DateofProcedure)) %>%
    mutate(rep_type = ifelse(DateofProcedureRep == DateofProcedure, 'replicate', 'serial')) %>%
    filter(!is.na(DateofProcedureRep))

nrow(sample_replicates) %>% paste('total replicates')
count(sample_replicates, rep_type)

segs_replicates = fread('./data/segs_replicates.tsv', sep = '\t')
failed_samples = segs_replicates %>% filter(is.na(seg)) %>% pull(dmp_sample_id)

cnv_reps = segs_filtered %>% filter(dmp_patient_id %in% sample_replicates$dmp_patient_id)

cnv_reps = cnv_reps %>%
#     left_join(M_wide %>% select(dmp_sample_id, DateofProcedure), by = 'dmp_sample_id') %>%
    left_join(
        sample_replicates %>% select(rep_sample_id = dmp_sample_id, DateofProcedureRep, dmp_patient_id),
        by = 'dmp_patient_id')

cnv_reps = cnv_reps %>% filter(!(dmp_sample_id %in% failed_samples | rep_sample_id %in% failed_samples))
cnv_reps = cnv_reps %>% mutate(rep_type = ifelse(DateofProcedureRep == DateofProcedure, 'replicate', 'serial'))

# collect the segments
outdir = '/work/isabl/home/gaot/ch_cnv/results'

segs_reps = c(cnv_reps$dmp_sample_id, cnv_reps$rep_sample_id) %>%
    lapply(function(sid){
        segfile = glue('{outdir}/{sid}_seg.tsv')
        if (file.exists(segfile)) {
            seg = fread(segfile)
            seg %>% mutate(dmp_sample_id = sid) %>% filter(aberrant)
        } else {
            data.frame(dmp_sample_id = c(sid)) %>% mutate(dmp_sample_id = as.character(dmp_sample_id))
        }
    }) %>% Reduce(bind_rows, .)

segs_reps = segs_reps %>% left_join(bam_replicates, by = "dmp_sample_id")

segs_reps = segs_reps %>% mutate(
    z_x = ifelse(is.na(z_x), t_cnlr, z_x),
    z_y = ifelse(is.na(z_y), t_valor, z_y),
    p_x = ifelse(is.na(p_x), p_cnlr, p_x),
    p_y = ifelse(is.na(p_y), p_valor, p_y)
)

segs_reps = segs_reps %>%
    group_by(dmp_patient_id) %>%
    mutate(timepoint = 1:n()) %>%
    mutate(timepoint = as.character(timepoint)) %>%
    ungroup()

segs_reps = segs_reps %>% filter(type != 'err' & err < 0.06) 

# segs_serial %>% fwrite('./data/segs_serial_may26.tsv', sep = '\t')

segs_reps = segs_reps %>% 
    left_join(M_wide %>% select(dmp_sample_id, DateofProcedure), by = 'dmp_sample_id') %>%
    left_join(sample_replicates %>% select(dmp_sample_id, DateofProcedureRep) %>% distinct(), by = 'dmp_sample_id') %>%
    mutate(DateofProcedure = if_else(is.na(DateofProcedureRep), DateofProcedure, DateofProcedureRep)) %>%
    select(-DateofProcedureRep) %>% 
    arrange(dmp_patient_id, DateofProcedure, dmp_sample_id)
Joining, by = "dmp_patient_id"

'1200 total replicates'
A data.table: 2 × 2
rep_typen
<chr><int>
replicate919
serial 281
In [ ]:
# segs_replicates = segs_replicates %>% 
#     mutate(dmp_patient_id = substr(dmp_sample_id, 1, 9)) %>%
#     group_by(dmp_patient_id) %>%
#     filter(!any(is.na(seg))) %>%
#     ungroup() %>%
#     left_join(M_wide %>% select(dmp_patient_id, Gender, Sex), by = 'dmp_patient_id')

# segs_replicates = segs_replicates %>% filter(dmp_patient_id %in% sample_replicates$dmp_patient_id)

# segs_replicates = segs_replicates %>% filter((p_chisq < 1e-10 & p_y < 1e-3)) %>%
#     filter(type != 'err' & err < 0.06) %>%
#     filter(!(phi >= 0.8 & type %in% c('amp'))) %>%
#     filter(!(type == 'amp' & size < 24e6)) %>% 
#     filter(!(type == 'loh' & size < 8e6)) %>%
#     filter(!(chrom == 23 & Sex == 'Male' & type == 'loh')) %>%
#     filter(!(chrom == 23 & type == 'amp')) %>%
#     filter(cnlr <= 0.5)
In [19]:
options(repr.plot.width = 4, repr.plot.height = 3, repr.plot.res = 300)

segs_reps %>%
mutate(label = ifelse(timepoint == '1', chrom, '')) %>%
ggplot(
    aes(x = DateofProcedure, y = phi, group = dmp_patient_id, color = type, label = label, size = size)
) +
scale_color_manual(values = cnv_pal) +
scale_fill_manual(values = cnv_pal) +
geom_line(alpha = 0.4, size = 1) +
geom_point(pch = 21, alpha = 1) +
theme_classic() +
geom_text_repel(size = 2, force = 3) +
ylim(0,1) +
# scale_x_discrete(expand = c(0.1,0)) +
scale_x_date(date_breaks = "1 year", date_labels =  "%Y")
In [20]:
options(repr.plot.width = 10, repr.plot.height = 2, repr.plot.res = 300)
source('/work/isabl/home/gaot/facets-ch/R/facets-ch-utils.R')

samples = sample_replicates %>%
    filter(rep_type == 'replicate') %>%
    filter(!(dmp_sample_id %in% failed_samples | rep_sample_id %in% failed_samples)) %>%
    filter(dmp_patient_id %in% segs_reps$dmp_patient_id)

plots = list()
plots_rep = list()

for (i in 1:nrow(samples)) {
    
    outdir = '/work/isabl/home/gaot/ch_cnv/results'
    
    sid = samples[i,'dmp_sample_id']   
    rid = samples[i,'rep_sample_id']
    
    plotfile = glue('{outdir}/{sid}.png')
    seg = fread(glue('{outdir}/{sid}_seg.tsv'))
    sig = fread(glue('{outdir}/{sid}_sig.tsv'))
    seg = seg %>% mutate(aberrant = ifelse(cnlr > 0.5, F, aberrant))

    p1 = facets_plot(sig, seg, snp_distance = F)
    
    plots[[i]] = p1
    
    
    plotfile = glue('{outdir}/{rid}.png')
    seg_rep = fread(glue('{outdir}/{rid}_seg.tsv'))
    sig_rep = fread(glue('{outdir}/{rid}_sig.tsv'))
    seg_rep = seg_rep %>% mutate(aberrant = ifelse(cnlr > 0.5, F, aberrant))
    
    p2 = facets_plot(sig_rep, seg_rep, snp_distance = F)
    
    plots_rep[[i]] = p2
        
}
In [42]:
## EDITED MS - R2 scatter plot
df_scatter = data.frame()

for (i in 1:nrow(samples)) {
    
    outdir = '/work/isabl/home/gaot/ch_cnv/results'
    
    sid = as.character(samples[i,'dmp_sample_id'] )
    rid = as.character(samples[i,'rep_sample_id'] ) 
    
#     plotfile = glue('{outdir}/{sid}.png')
    seg = fread(glue('{outdir}/{sid}_seg.tsv'))
#     sig = fread(glue('{outdir}/{sid}_sig.tsv'))
    
    seg = seg %>% mutate(aberrant = ifelse(cnlr > 0.5, F, aberrant)) %>% 
        filter(aberrant ) %>% select(phi_rep1 = phi, chrom, type) 
  
    
#     p1 = facets_plot(sig, seg, snp_distance = F)
    
#     plots[[i]] = p1
    
    
#     plotfile = glue('{outdir}/{rid}.png')
    seg_rep = fread(glue('{outdir}/{rid}_seg.tsv'))
#     sig_rep = fread(glue('{outdir}/{rid}_sig.tsv'))
    seg_rep = seg_rep %>% mutate(aberrant = ifelse(cnlr > 0.5, F, aberrant)) %>% 
        filter(aberrant) %>% select(phi_rep2 = phi ) 
    df_scatter = rbind(df_scatter, cbind(seg, seg_rep))

        
}
In [65]:
### EDITED MS SCATTER REVIEWER 2 
df_scatter = df_scatter %>% 
    mutate(type2 = factor(c('del' = 'DEL', 'loh' = 'CNLOH', 'amp' = 'AMP')[type], c('DEL', 'CNLOH', 'AMP')))

p_scatter = ggplot(df_scatter, aes( x = phi_rep1, y = phi_rep2, label = chrom, color = type2)) + geom_abline(linetype = "dashed")  + geom_point() + 
geom_text_repel( ) + 
 ylim(0,1) + xlim(0,1) + theme_classic() + 
scale_color_manual(values = cnv_pal2)  + labs(color = " ") + xlab("Cell Fraction 1") + ylab("Cell Fraction 2")

do_plot(p_scatter, "R2_scatter_phi.pdf", h = 3, w = 4)

#CNLOH'darkgreen'AMP'darkblue'DEL'darkred'
Warning message:
“Removed 1 rows containing missing values (geom_point).”
Warning message:
“Removed 1 rows containing missing values (geom_text_repel).”
Warning message:
“Removed 1 rows containing missing values (geom_point).”
Warning message:
“Removed 1 rows containing missing values (geom_text_repel).”
In [21]:
options(warn = -1)
wrap_plots(plots, ncol = 1) %>% do_plot('rep_sample.pdf', w = 10, h = 12)
wrap_plots(plots_rep, ncol = 1) %>% do_plot('rep_rep.pdf', w = 10, h = 12)
options(warn = 0)

GM characterstics

In [202]:
D = M_long %>% 
    filter(ch_nonsilent == 1) %>%
    rowwise() %>%
    mutate(
        cis_cnv = any(chrom == segs_filtered$chrom & dmp_patient_id == segs_filtered$dmp_patient_id &
            start >= segs_filtered$start - window & start <= segs_filtered$end + window)
    ) %>%
    ungroup() %>%
    filter(cis_cnv == 0)

p = ggplot(
        D,
        aes(x = VAF_N * 2)
    ) +
    geom_density(
        trim = TRUE,
        alpha = 0.5,
        fill = sm_col,
        size = 0.5
    ) +
    geom_density(
        trim = TRUE,
        inherit.aes = F,
        data = segs_filtered,
        mapping = aes(x = phi),
        fill = cnv_col,
        alpha = 0.5,
        size = 0.5
    ) +
    xlim(0,1) +
    theme_classic() +
    xlab('Estimated cell fraction')

do_plot(p, 'sensitivity.pdf', w = 2, h = 1.9)
Warning message:
“Removed 6 rows containing non-finite values (stat_density).”
Warning message:
“Removed 6 rows containing non-finite values (stat_density).”
In [203]:
M_long %>% filter(ch_nonsilent == 1) %>% dim
M_long %>% filter(ch_nonsilent == 1) %>% pull(VAF_N) %>% median
M_long %>% filter(ch_nonsilent == 1) %>% pull(VAF_N) %>% range
D %>% pull(VAF_N) %>% {.*2} %>% range
M_wide %>% count(ch_nonsilent) %>% mutate(prop = n/sum(n))
  1. 14789
  2. 299
0.04451
  1. 0.02
  2. 0.79475
  1. 0.04
  2. 1.06268
A tibble: 2 × 3
ch_nonsilentnprop
<int><int><dbl>
0225880.6962579
1 98540.3037421
In [257]:
font_size = 8

M_long_nonsilent = M_long %>% 
    filter(!VariantClass %in% c('Silent', 'Intron')) %>%
    mutate(
        VariantClass3 = case_when(
            VariantClass == 'Frame_Shift_Del' ~ 'Frameshift indel',
            VariantClass == 'Frame_Shift_Ins' ~ 'Frameshift indel',
            VariantClass == 'In_Frame_Del' ~ 'Inframe indel',
            VariantClass == 'In_Frame_Ins' ~ 'Inframe indel',
            VariantClass == 'Splice_Site' ~ 'Splice or other',
            VariantClass == 'Splice_Region' ~ 'Splice or other',
            VariantClass == "5'Flank" ~ 'Splice or other',
            VariantClass == 'Translation_Start_Site' ~ 'Splice or other',
            VariantClass == 'Nonstop_Mutation' ~ 'Splice or other',
            VariantClass == 'Missense_Mutation' ~ 'Missense',
            VariantClass == 'Nonsense_Mutation' ~ 'Nonsense',
            T ~ VariantClass)
    ) %>%
    mutate(VariantClass3 = factor(VariantClass3))

panel_theme = theme(
    legend.direction = 'vertical',
    legend.position="top", 
    legend.text = element_text(size=12),
    plot.margin = margin(t = 0, r = 0, b = 0, l = 0),
    axis.text = element_text(size = font_size)
  )

# All patients

gene_list = M_long_nonsilent %>% count(Gene) %>% arrange(-n) %>% .$Gene %>% unique %>% .[1:30]

p_all = ggplot(
        M_long_nonsilent %>% filter(Gene %in% gene_list) %>% 
        mutate(Gene = as.integer(factor(Gene, gene_list))) %>%
        count(Gene, VariantClass3),
        aes(x = Gene, y = n, fill = VariantClass3)
    ) + 
    geom_bar(stat = 'identity', color = 'black', size = 0.2, width = 0.9) +
    ylab("Number of Mutations") +
    theme_classic() +
    theme(
        panel.grid.major = element_blank(), 
        panel.border = element_blank(),
        axis.line = element_line(colour = "white"),
        legend.title = element_blank()
    ) +
    theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'right',
        strip.background = element_rect(fill = NA, size = 1, color = 'gray')
    ) +
    facet_zoom(xlim = c(5.5 + 0.1, 29.5 - 0.1), zoom.size = 1, ylim = c(0,500), horizontal = F, show.area = F) +
    scale_x_continuous(labels = gene_list, breaks = 1:length(gene_list), expand = expansion(add=0.1)) +
    scale_fill_discrete(drop = F)

# CNV subset

gene_list = M_long_nonsilent %>% 
    filter(!VariantClass %in% c('Silent', 'Intron')) %>%
    filter(dmp_patient_id %in% (M_wide %>% filter(ch_cnv == 1) %>% pull(dmp_patient_id))) %>% 
    count(Gene) %>% arrange(-n) %>% .$Gene %>% unique %>% .[1:30]

p_cnv = ggplot(
    M_long_nonsilent %>% 
    filter(Gene %in% gene_list) %>% 
    filter(dmp_patient_id %in% (M_wide %>% filter(ch_cnv == 1) %>% pull(dmp_patient_id))) %>% 
    mutate(Gene = as.integer(factor(Gene, gene_list))) %>%
    count(Gene, VariantClass3),
    aes(x = Gene, y = n, fill = VariantClass3)
) + 
geom_bar(stat = 'identity', color = 'black', size = 0.2) +
ylab("Number of Mutations") +
theme_classic() +
theme(
    panel.grid.major = element_blank(), 
    panel.border = element_blank(),
    axis.line = element_line(colour = "white"),
    legend.title = element_blank()
) +
theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = 'right',
    strip.background = element_rect(fill = NA, size = 1, color = 'gray')
) +
facet_zoom(xlim = c(5.5 + 0.1, 29.5 - 0.1), zoom.size = 1, ylim = c(0,15), horizontal = F, show.area = F) +
scale_x_continuous(labels = gene_list, breaks = 1:length(gene_list), expand = expansion(add=0.1)) +
scale_fill_discrete(drop = F)

do_plot(p_all, 'sm_bar_all.pdf', w = 8, h = 4)
do_plot(p_cnv, 'sm_bar_ai.pdf', w = 8, h = 4)
In [178]:
D = M_wide %>%
    filter(ch_nonsilent == 1) %>%
    mutate(mutnum_nonsilent = case_when(
        mutnum_nonsilent < 5 ~ as.character(mutnum_nonsilent), T ~ '5+')
    ) %>%
    count(mutnum_nonsilent) %>%
    mutate(prop = n/sum(n))

display(D)

p_n_all = ggplot(
        D,
        aes(x = mutnum_nonsilent, y = n)
    ) +
    theme_classic() +
    geom_bar(stat = 'identity', position = 'stack') +
    scale_fill_manual(values = cnv_pal) +
    theme(
        legend.position = 'none',
        axis.title = element_text(size = 8)
    ) +
    xlab('Events per patient') +
    ylab('Number of patients')
#     scale_y_continuous(expand = expansion(add = 0))


D = M_wide %>% 
    filter(ch_nonsilent == 1) %>%
    filter(ch_cnv == 1) %>%
    mutate(mutnum_nonsilent = case_when(
        mutnum_nonsilent < 5 ~ as.character(mutnum_nonsilent), T ~ '5+')
    ) %>%
    count(mutnum_nonsilent)

p_n_ai = ggplot(
        D,
        aes(x = mutnum_nonsilent, y = n)
    ) +
    theme_classic() +
    geom_bar(stat = 'identity', position = 'stack') +
    scale_fill_manual(values = cnv_pal) +
    theme(
        legend.position = 'none',
        axis.title = element_text(size = 8)
    ) +
    xlab('Events per patient') +
    ylab('Number of patients')
#     scale_y_continuous(expand = expansion(add = 0))

(p_n_all / p_n_ai) %>%
do_plot('sm_num.pdf', w = 2, h = 3.7)
A tibble: 5 × 3
mutnum_nonsilentnprop
<chr><int><dbl>
1 69190.702151411
2 20890.211995129
3 6050.061396387
4 1760.017860767
5+ 650.006596306
In [231]:
options(repr.plot.width = 5, repr.plot.height = 2, repr.plot.res = 300)

D = M_wide %>% 
    mutate(age_bin = cut(age, c(0, seq(20, 80, 10), 100))) %>%
    group_by(age_bin) %>%
    summarise(
        `GM VAF>2%` = sum(ch_nonsilent & VAF_nonsilent > 0.02)/n(),
        `GM VAF>5%` = sum(ch_nonsilent & VAF_nonsilent > 0.05)/n(),
        `GM VAF>10%` = sum(ch_nonsilent & VAF_nonsilent > 0.1)/n(),
        n = n()
    ) %>% 
    melt(measure.vars = c('GM VAF>2%', 'GM VAF>5%', 'GM VAF>10%'), variable.name = 'threshold', value.name = 'prop') %>%
    rbind(
        M_wide %>% 
        mutate(age_bin = cut(age, c(0, seq(20, 80, 10), 100))) %>%
        group_by(age_bin) %>%
        summarise(
            prop = sum(ch_cnv)/n(),
            n = n()
        ) %>%
        mutate(threshold = 'mCA')
    )

p_age = ggplot(
    D,
    aes(x = age_bin, y = prop, group = threshold, color = threshold)
) +
geom_line() +
theme_classic() +
geom_point() +
ylab('Proportion of subjects') +
xlab('Age') +
labs(color = '')

p_age %>%
do_plot('sm_age.pdf', w = 5, h = 2)
`summarise()` ungrouping output (override with `.groups` argument)

Warning message in melt(., measure.vars = c("GM VAF>2%", "GM VAF>5%", "GM VAF>10%"), :
“The melt generic in data.table has been passed a tbl_df and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(.). In the next version, this warning will become an error.”
`summarise()` ungrouping output (override with `.groups` argument)

Paired tMN

In [957]:
tmns_sequenced = fread('/work/isabl/home/gaot/ch_cnv/data/sequenced_tmns_processed.tsv', sep = '\t')
tmns_sequenced = tmns_sequenced %>% mutate(Gender = substr(Gender, 1,1)) %>%
    mutate(isabl_exp_id = `Experiment System ID`)
tmns_sequenced = tmns_sequenced %>% inner_join(M_wide %>% select(
    dmp_patient_id, dmp_sample_id, valid_diagnosis, seq_to_diag, post_tmn, post_anyblood, heme_cat, ch_cnv, ch_nonsilent
), by = "dmp_patient_id")
tmns_sequenced = tmns_sequenced %>% filter(valid_diagnosis == 1)
tmns_sequenced = tmns_sequenced %>% filter(heme_cat %in% c('AML', 'MDS')) 

tmns_sequenced %>% count(heme_cat)

tmns_sequenced %>% count(bait)

tmns_sequenced %>% count(ch_cnv)
tmns_sequenced %>% count(ch_nonsilent)
A data.table: 2 × 2
heme_catn
<chr><int>
AML 4
MDS11
A data.table: 2 × 2
baitn
<chr><int>
HEMEPACT 1
IWG 14
A data.table: 2 × 2
ch_cnvn
<dbl><int>
011
1 4
A data.table: 2 × 2
ch_nonsilentn
<int><int>
08
17
In [961]:
# tmns_sequenced %>% fwrite('./data/paired_tmn.tsv', sep = '\t')

Serial LOY

In [ ]:
segs_y_serial = fread('./data/y_segs_serial.tsv', sep = '\t')

segs_y_serial = segs_y_serial %>%
    group_by(dmp_patient_id) %>%
    mutate(timepoint = 1:n()) %>%
    mutate(timepoint = as.character(timepoint)) %>%
    ungroup()
In [ ]:
options(repr.plot.width = 4, repr.plot.height = 2.5, repr.plot.res = 300)

segs_y_serial %>%
ggplot(
    aes(x = timepoint, y = cnlr, group = dmp_patient_id)
) +
geom_line() +
geom_point(pch = 21, alpha = 0.4) +
theme_classic() +
scale_x_discrete(expand = c(0.1,0))

Associations with exposures and gene mutations

Venn

In [227]:
library(VennDiagram)
library(RColorBrewer)
futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger")

venn.diagram(
    x = list(M_wide[M_wide$ch_cnv == 1,][['dmp_patient_id']], M_wide[M_wide$ch_nonsilent == 1,][['dmp_patient_id']]),
    category.names = c('mCA', 'GM'),
    
    # output features
    filename = './figures/venn.tiff',
    imagetype="tiff",
    height = 800, 
    width = 800, 
#     resolution = 300,
    compression = "lzw",
    output = TRUE,
    margin = 0.1,
    
    # circles
    lwd = 1,
    fill = c(cnv_col, sm_col),

    # Numbers
    cex = 0.6,
    fontface = "bold",
    fontfamily = "sans",
    ext.dist = 0.06,
    ext.line.lwd = 0.8,
    
    # Set names
    cat.cex = 0.55,
    cat.fontface = "bold",
    cat.default.pos = "outer",
    cat.fontfamily = "sans",
    cat.dist = 0.1
)
NULL
1
In [73]:
display(table(M_wide$ch_cnv, M_wide$ch_nonsilent))
fisher.test(table(M_wide$ch_cnv, M_wide$ch_nonsilent))
   
        0     1
  0 22459  9637
  1   129   217
	Fisher's Exact Test for Count Data

data:  table(M_wide$ch_cnv, M_wide$ch_nonsilent)
p-value < 2.2e-16
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 3.133768 4.919801
sample estimates:
odds ratio 
   3.92049 
In [66]:
glm(
    formula = ch_cnv ~ ch_nonsilent + age + age_sq,
    data = M_wide %>% mutate(age_sq = age^2)
) %>%
sjPlot::get_model_data(type = 'est')
A data.frame: 3 × 14
termestimatestd.errorconf.lowconf.highstatisticdf.errorp.valuep.starsp.labelgroupxposxminxmax
<fct><dbl><dbl><dbl><dbl><dbl><int><dbl><chr><chr><chr><dbl><dbl><dbl>
ch_nonsilent 1.203327e-021.314030e-03 9.457820e-03 1.460872e-02 9.157532324385.612187e-20***0.01 *** pos32.8253.175
age -1.330650e-032.693367e-04-1.858540e-03-8.027599e-04-4.940471324387.832153e-07***-0.00 ***neg21.8252.175
age_sq 1.480888e-052.324557e-06 1.025283e-05 1.936492e-05 6.370623324381.907831e-10***0.00 *** pos10.8251.175

Volcano

In [70]:
genes = M_long %>% count(Gene) %>% filter(n >= 30) %>% pull(Gene)

res = data.frame()

for (gene in genes) {
    
    test = fisher.test(table(M_wide[[gene]], M_wide[['ch_cnv']]))
    
    res = rbind(
        res,
        data.frame(
            'p.value' = test$p.value,
            'Gene' = gene,
            'OR' = test$estimate
        )
    )
}

res = res %>% mutate(q = p.adjust(p.value, n = length(genes), method = 'BH'))
In [72]:
length(genes)
138
In [71]:
p_volcano = ggplot(
    res %>% mutate(label = ifelse(q < 0.05, as.character(Gene), NA)),
    aes(x = OR, y = -log(q), color = q < 0.05, label = label)
) +
geom_hline(yintercept = -log(0.05), linetype = 'dashed') +
geom_point() +
geom_text_repel(size = 2.5, seed = 3) +
theme_classic() +
scale_color_manual(values = c('gray', 'darkred')) +
theme(
    legend.position = 'none'
)

n_cnv = sum(M_wide$ch_cnv)
n_normal = nrow(M_wide) - sum(M_wide$ch_cnv)

genes = M_long %>% count(Gene) %>% arrange(-n) %>% filter(n > 30) %>% pull(Gene)

D = M_long %>%
    filter(ch_nonsilent == 1) %>%
    count(Gene, ch_cnv) %>%
    tidyr::complete(Gene, ch_cnv, fill = list(n = 0)) %>% 
    mutate(n_total = ifelse(ch_cnv == 1, n_cnv, n_normal)) %>% 
    group_by(ch_cnv) %>%
    mutate(prop = n/n_total) %>%
    left_join(
        res,
        by = 'Gene'
    ) %>%
    filter(q < 0.05) %>%
    arrange(desc(n)) %>%
    mutate(Gene = factor(Gene, Gene))

p_genes = ggplot(
    D %>% mutate(ch_cnv = ifelse(ch_cnv == 1, 'mCA+', 'mCA-')),
    aes(x = Gene, y = prop, fill = ch_cnv)
) +
theme_classic() +
geom_col(
    position = position_dodge(),
    width = 0.75,
    size = 0.2,
    color = 'black'
) +
scale_y_continuous(expand = expansion(add = 0.001), labels = abs) +
scale_fill_manual(values = c(cnv_col, 'gainsboro')) +
theme(
    axis.text.x = element_text(angle = 30, hjust = 1, size = 7),
    legend.title = element_blank(),
    legend.position = 'right'
) +
ylab('Proportion of patients') +
xlab('')

panel = (p_volcano | p_genes) + plot_layout(widths = c(1,3))

do_plot(panel, 'sm_enrichment.pdf', w = 7.5, h = 2.2)
Warning message:
“Removed 131 rows containing missing values (geom_text_repel).”
Warning message:
“Removed 131 rows containing missing values (geom_text_repel).”

mCA ~ VAF & Mutnum

In [12]:
D = M_wide %>% 
    mutate(mutnum_bin = ifelse(mutnum_nonsilent >= 4, '4+', as.character(mutnum_nonsilent))) %>%
    group_by(mutnum_bin) %>%
    summarise(
        n_cnv = sum(ch_cnv),
        total = n(),
        prop = n_cnv/total,
        `.groups` = 'drop'
    ) %>%
    mutate(
        prop_lower = qbinom(p = 0.05, size = total, prob = prop)/total,
        prop_upper = qbinom(p = 0.95, size = total, prob = prop)/total,
    )

p_mutnum = ggplot(
        D,
        aes(x = mutnum_bin, y = prop, group = '', ymin = prop_lower, ymax = prop_upper)
    ) +
    geom_ribbon(fill = 'royalblue', alpha = 0.3) +
    geom_line(color = 'black') +
    geom_point(color = 'black') +
    theme_classic() +
    ylab('Prop. mCA') +
    xlab('Number of mutations') +
    scale_y_continuous(expand = expansion(mult = 0.05), limits = c(0,NA))

display(D)

D = M_wide %>%
    mutate(VAF_bin = case_when(
        VAF_trans == 0 ~ '<2%',
        VAF_trans < 0.05 ~ '2-5%',
        VAF_trans < 0.1 ~ '5-10%',
        VAF_trans < 0.2 ~ '10-20%',
        T ~ '>20%'
    )) %>%
    arrange(VAF_trans) %>%
    mutate(VAF_bin = factor(VAF_bin, unique(VAF_bin))) %>%
    group_by(VAF_bin) %>%
    summarise(
        n_cnv = sum(ch_cnv),
        total = n(),
        prop = n_cnv/total,
        `.groups` = 'drop'
    ) %>%
    mutate(
        prop_lower = qbinom(p = 0.05, size = total, prob = prop)/total,
        prop_upper = qbinom(p = 0.95, size = total, prob = prop)/total,
    )

display(D)

p_vaf = ggplot(
        D,
        aes(x = VAF_bin, y = prop, ymin = prop_lower, ymax = prop_upper, group = '')
    ) +
    geom_ribbon(fill = 'royalblue', alpha = 0.3) +
    geom_line(color = 'black') +
    geom_point(color = 'black') +
    theme_classic() +
    ylab('Prop. mCA') +
    xlab('Max VAF') +
    scale_y_continuous(expand = expansion(mult = 0.05), limits = c(0,NA)) +
    theme(axis.text.x = element_text(angle = 30, hjust = 1))

panel = p_mutnum | p_vaf

do_plot(panel, 'mutnum_vaf.pdf', w = 4, h = 2)
A tibble: 5 × 6
mutnum_binn_cnvtotalpropprop_lowerprop_upper
<chr><dbl><int><dbl><dbl><dbl>
0 129225880.0057109970.0049141140.006552152
1 113 69190.0163318400.0138748370.018933372
2 55 20890.0263283870.0205840110.032072762
3 34 6050.0561983470.0413223140.072727273
4+ 15 2410.0622406640.0373443980.087136929
A tibble: 5 × 6
VAF_binn_cnvtotalpropprop_lowerprop_upper
<fct><dbl><int><dbl><dbl><dbl>
<2% 143226020.0063268740.0054862400.007211751
2-5% 57 48160.0118355480.0093438540.014534884
5-10% 50 24330.0205507600.0160295930.025482943
10-20% 46 15530.0296200900.0225370250.036703155
>20% 50 10380.0481695570.0375722540.059730250
In [15]:
D = M_wide %>% 
    mutate(mutnum_bin = ifelse(mutnum_nonsilent >= 3, '3+', as.character(mutnum_nonsilent))) %>%
    group_by(mutnum_bin) %>%
    summarise(
        n_cnv = sum(ch_cnv),
        total = n(),
        prop = n_cnv/total,
        `.groups` = 'drop'
    ) %>%
    mutate(
        prop_lower = qbinom(p = 0.05, size = total, prob = prop)/total,
        prop_upper = qbinom(p = 0.95, size = total, prob = prop)/total,
    )

D
A tibble: 4 × 6
mutnum_binn_cnvtotalpropprop_lowerprop_upper
<chr><dbl><int><dbl><dbl><dbl>
0 129225880.0057109970.0049141140.006552152
1 113 69190.0163318400.0138748370.018933372
2 55 20890.0263283870.0205840110.032072762
3+ 49 8460.0579196220.0449172580.070921986
In [14]:
sum(M_wide$ch_cnv)/nrow(M_wide)
0.0106651871031379
In [939]:
M_wide %>%
    mutate(mutnum_bin = ifelse(mutnum_trans >= 4, '4+', as.character(mutnum_trans))) %>%
    filter(smoke != 'missing') %>%
    filter(mutnum_bin != '0') %>%
    glm(
        ch_cnv_auto ~ age10 + mutnum_trans + VAF_trans,
        data = .,
        family = 'binomial'
    ) %>% 
    sjPlot::get_model_data(type = 'est') %>%
    mutate(feature = 'mutnum')
A data.frame: 3 × 15
termestimatestd.errorconf.lowconf.highstatisticdf.errorp.valuep.starsp.labelgroupxposxminxmaxfeature
<fct><dbl><dbl><dbl><dbl><dbl><int><dbl><chr><chr><chr><dbl><dbl><dbl><chr>
age10 1.4074690.07970730 1.2062793 1.6485964.28810693971.802030e-05***1.41 *** pos32.8253.175mutnum
mutnum_trans 1.0893850.07273881 0.9395555 1.2501771.17699193972.391990e-01 1.09 pos21.8252.175mutnum
VAF_trans 187.6933240.6964393046.5463119716.8000807.51653493975.624762e-14***187.69 ***pos10.8251.175mutnum
In [288]:
options(repr.plot.width = 6, repr.plot.height = 1.5, repr.plot.res = 300)

M_wide %>%
mutate_at(
    c('ch_my_pd', 'ch_cnv', 'ch_all'),
    as.character
) %>%
mutate(ch_status = case_when(
    ch_my_pd == 1 ~ 'ch_my_pd',
    ch_all == 1 ~ 'ch_other',
    T ~ 'none'
)) %>%
mutate(ch_status = factor(ch_status, rev(unique(ch_status)))) %>%
count(ch_status, ch_cnv) %>%
group_by(ch_cnv) %>%
mutate(prop = n/sum(n)) %>%
ungroup() %>%
ggplot(
    aes(x = ch_cnv, y = prop, fill = ch_status, label = n)
) +
geom_col() +
theme_classic() +
coord_flip() + 
geom_text(position = position_stack(vjust = 0.5), color = 'white') +
scale_y_continuous(expand = c(0,0))
In [ ]:
# use stacked bar instead
In [644]:
options(repr.plot.width = 7, repr.plot.height = 3, repr.plot.res = 300)

germline_cause = c('11=', '1=', '15+', '23-', '14=', '16=', '8=', '10-', '15-', '15=')

# https://www.nejm.org/doi/full/10.1056/NEJMoa1516192
aml_cnvs = c('7-', '8+', '5-', '17-', '9-', '12-', '21+')

# https://www.nature.com/articles/nature14666
cll_cnvs = c('12+', '13-', '11-', '17-', '6-')

# elsa's data
mds_cnvs = c('5-', '8+', '7-', '20-', '11-', '17-', '17=', '4=', '13-', '21+')

# 
mpn_cnvs = c('9=', '20-', '1=', '17=', '7=', '14=')

D = segs_filtered %>% 
    mutate_at(
        c('ch_sm'),
        as.character
    ) %>%
    mutate(mutnum = ifelse(mutnum_nonsilent > 2, '3+', as.character(mutnum_nonsilent))) %>%
    mutate(germline_cause = cnv_label %in% germline_cause) %>%
    group_by(cnv_label) %>%
    mutate(
        prop_sm = sum(ch_nonsilent)/n(),
        mutnum_median = mean(mutnum_nonsilent),
        total = n()
    ) %>%
    filter(total >= 4) %>%
    ungroup() %>%
    arrange(mutnum_median) %>%
    mutate(cnv_label = factor(cnv_label, unique(cnv_label))) %>%
    count(mutnum, cnv_label) %>%
    group_by(cnv_label) %>%
    mutate(
        total = sum(n),
        prop = n/total
    ) %>%
    ungroup() %>%
    arrange(cnv_label) %>%
    mutate(
        category = case_when(
            cnv_label %in% germline_cause ~ 'germline',
            cnv_label %in% c(aml_cnvs, cll_cnvs) ~ 'CLL/AML',
            T ~ 'none')
    )

ggplot(
    D,
    aes(x = cnv_label, y = prop, fill = mutnum, label = total)
) +
theme_classic() +
geom_col(size = 0) +
scale_y_continuous(expand = c(0,0), limits = c(0, 1.05)) +
theme(
    axis.text.x = element_text(
        angle = 30, hjust = 0.5, size = 9,
#         color = case_when(
#             unique(D$cnv_label) %in% germline_cause ~ 'blue',
#             unique(D$cnv_label) %in% c(aml_cnvs, cll_cnvs) ~ 'red',
#             T ~ 'black'),
        face = 'bold'),
    legend.box = "horizontal",
    legend.position = 'top'
) +
# guides(
#     colour = guide_legend(override.aes = list(size = 1, fill = NA)),
#     fill = guide_legend(override.aes = list(size = 2))
# ) +
# annotate('text', label = D$total, x = D$cnv_label, y = 1, vjust = -0.1, size = 2) +
xlab('') +
ylab('proportion') +
scale_fill_brewer(palette = 'Reds', name = '#GM')
# scale_color_manual(values = c('germline' = 'blue', 'CLL/AML' = 'red', 'none' = 'black'), name = 'etiology')

Exposures

In [254]:
summ_nonsilent_imputed = M_wide %>%
    filter(!therapy_detailed & smoke != 'missing') %>%
    ungroup() %>%
    glm(
        ch_nonsilent ~ Gender + race_b + age10 + smoke + eqd_3_t + pct_cytotoxic_therapy,
        data = .,
        family = 'binomial'
    ) %>% 
    sjPlot::get_model_data(type = 'est')

summ_nonsilent_detailed = M_wide %>%
    filter(therapy_detailed & smoke != 'missing') %>%
    ungroup() %>%
    glm(
        ch_nonsilent ~ Gender + race_b + age10 + smoke + eqd_3_t + pct_cytotoxic_therapy,
        data = .,
        family = 'binomial'
    ) %>% 
    sjPlot::get_model_data(type = 'est')
Registered S3 methods overwritten by 'lme4':
  method                          from
  cooks.distance.influence.merMod car 
  influence.merMod                car 
  dfbeta.influence.merMod         car 
  dfbetas.influence.merMod        car 

In [1076]:
class_c = fread('./data/clinicalreview/chemoclass_jan2019_revised.csv') %>% pull(drugclass_c) %>% unique
In [211]:
options(repr.plot.width = 4, repr.plot.height = 2.5, repr.plot.res = 500)

options(warn=-1)

summ_cnv = M_wide %>%
    filter(smoke != 'missing') %>%
    mutate(smoke = as.integer(smoke)) %>%
    glm(
        ch_cnv_auto ~ Gender + race_b + age10 + smoke,
        data = .,
        family = 'binomial'
    ) %>% 
    sjPlot::get_model_data(type = 'est')

summ_cnv_therapy = M_wide %>%
    filter(therapy_detailed & smoke != 'missing') %>%
    filter(DateofProcedure == DOP_kelly) %>%
    group_by(tumor_type) %>% mutate(n_tumor_type = n()) %>% ungroup() %>%
    mutate(tumor_type = ifelse(n_tumor_type < 100, 'Other', tumor_type)) %>%
    mutate(
#         eqd_3_t = as.character(eqd_3_t),
#         pct_cytotoxic_therapy = as.character(pct_cytotoxic_therapy),
        smoke = as.integer(smoke)
    ) %>%
    glm(
        ch_cnv_auto ~ Gender + race_b + age10 + smoke + XRT + ind_cytotoxic_therapy,
        data = .,
        family = 'binomial'
    ) %>% 
    sjPlot::get_model_data(type = 'est')

options(warn=0)

rbind(
    summ_cnv %>% mutate(cohort = 'full'),
    summ_cnv_therapy %>% mutate(cohort = 'therapy')
)
A data.frame: 10 × 15
termestimatestd.errorconf.lowconf.highstatisticdf.errorp.valuep.starsp.labelgroupxposxminxmaxcohort
<fct><dbl><dbl><dbl><dbl><dbl><int><dbl><chr><chr><chr><dbl><dbl><dbl><chr>
GenderMale 1.34780760.119293441.06744141.704679 2.502060308651.234732e-02* 1.35 * pos43.8254.175full
race_b 1.47630570.182628711.04698932.146831 2.132977308653.292663e-02* 1.48 * pos32.8253.175full
age10 1.76459300.053640341.59025521.96238410.587556308653.403625e-26***1.76 ***pos21.8252.175full
smoke 1.18128270.123821230.92855881.509500 1.345495308651.784654e-01 1.18 pos10.8251.175full
GenderMale 1.42614180.210384960.94608012.163800 1.687254101939.155460e-02 1.43 pos65.8256.175therapy
race_b 1.44080660.313031900.81143712.798474 1.166664101932.433460e-01 1.44 pos54.8255.175therapy
age10 1.68625730.095414171.40338912.039946 5.476246101934.344439e-08***1.69 ***pos43.8254.175therapy
smoke 1.34728200.224227310.87579132.116020 1.329406101931.837139e-01 1.35 pos32.8253.175therapy
XRT 1.66936630.223129391.07417292.581366 2.296623101932.164028e-02* 1.67 * pos21.8252.175therapy
ind_cytotoxic_therapy0.87809830.222498450.56708421.359272-0.584259101935.590461e-01 0.88 neg10.8251.175therapy
In [214]:
p_forest = rbind(
        summ_cnv %>% mutate(cohort = paste0('Full (n=', nrow(M_wide[M_wide$smoke != 'missing',]), ')')),
        summ_cnv_therapy %>% mutate(cohort = paste0('Therapy (n=', sum(M_wide[M_wide$smoke != 'missing',]$therapy_detailed), ')'))
    ) %>%
    arrange(cohort) %>%
    mutate(cohort = factor(cohort, rev(unique(cohort)))) %>%
    filter(!str_detect(term, 'tumor_type')) %>%
    mutate(term = factor(term, c('age10', 'GenderMale', 'race_b', 'smoke', 'XRT', 'ind_cytotoxic_therapy'))) %>%
    arrange(term) %>%
    mutate(
        term = c('age10' = 'Age', 'GenderMale' = 'Male Gender', 'race_b' = 'White Race',
                'smoke' = 'Ever Smoker', 'XRT' = 'External Beam Radiation', 'ind_cytotoxic_therapy' = 'Cytotoxic Therapy')[term]
    ) %>%
    mutate(term = factor(term, rev(unique(term)))) %>%
    plot_forest(
        x = 'term',
        label = 'p.label',
        eb_w = 0,
        eb_s = 1,
        ps = 2,
        or_s = 3,
        nudge = -0.4,
        OR = T,
        col = 'cohort',
        dodge_width = 0.7
    ) +
    theme_classic() +
    ylab('OR') +
    xlab('') +
    scale_color_discrete(name = 'Cohort')

do_plot(p_forest, 'forest.pdf', w = 6, h = 4)

Global effects

In [3]:
top_genes = c('DNMT3A', 'TET2', 'ASXL1', 'EZH2', 'TP53', 'ATM', 'CHEK2', 'PPM1D',
        'SRSF2', 'SF3B1', 'U2AF1', 'MPL', 'JAK2', 'TERT')

global_effects = data.frame()

for (gene in top_genes) {
        
    D = M_wide %>% 
        filter(!is.na(get(gene))) %>%
        mutate(multi_cnv_chrom = as.integer(n_cnv_chrom > 1)) %>%
        filter(ch_cnv_auto == 1)
    
    n_co = sum(D[[paste0(gene, '')]] == 1 & D[['multi_cnv_chrom']] == 1)
    
    if (n_co >= 2) {
        
        pval = fisher.test(table(D[[paste0(gene, '')]], D[['multi_cnv_chrom']]))$p.value
        
        global_effects = rbind(
            global_effects,
            data.frame(
                'Gene' = gene,
                'intersect_count' = n_co,
                'pval' = pval
            )
        )
    }
}

global_effects = global_effects %>% mutate(q = p.adjust(pval, method = 'BH', n = length(top_genes)))
In [4]:
global_effects
A data.frame: 5 × 4
Geneintersect_countpvalq
<fct><int><dbl><dbl>
DNMT3A40.7657285531.00000000
TET2 30.7317768871.00000000
TP53 50.0015139090.02119473
ATM 30.0683102140.47817150
PPM1D 20.1145428980.53453353
In [112]:
source('/work/isabl/home/gaot/facets-ch/R/facets-ch-utils.R')

plots = list()
complex_samples = M_wide %>% filter(n_cnv_chrom >= 2 & TP53) %>% arrange(VAF_TP53) %>% pull(dmp_sample_id)

for (sid in (complex_samples)) {
    outdir = '/work/isabl/home/gaot/ch_cnv/results'

    seg = fread(glue('{outdir}/{sid}_seg.tsv'))
    sig = fread(glue('{outdir}/{sid}_sig.tsv'))
    
    plots[[sid]] = facets_plot(sig, seg, title = sid, snp_distance = F)
}
In [113]:
panel = wrap_plots(plots, ncol = 1)
options(warn = -1)
do_plot(panel, 'complex.pdf', w = 10, h = 8)
options(warn = 0)

Cis effects

In [5]:
cis_genes = c('DNMT3A', 'TET2', 'ASXL1', 'EZH2', 'TP53', 'ATM', 'CHEK2', 'PPM1D',
        'SRSF2', 'SF3B1', 'U2AF1', 'MPL', 'JAK2', 'TERT')

cis_gene_bed = gene_bed %>% filter(Gene %in% cis_genes)

overlap_gene = function(seg_chrom, seg_start, seg_end, window = 5e6) {
    
    res = cis_gene_bed %>% 
        mutate(overlap = chrom == seg_chrom & start > seg_start - window & end < seg_end + window) %>%
        pull(overlap) %>% as.integer()
    
    return(res)
}

cnv_overlaps = mapply(
        overlap_gene,
        segs_filtered$chrom, segs_filtered$start, segs_filtered$end
    ) %>%
    t %>%
    as.data.frame() %>%
    setNames(paste0(cis_gene_bed$Gene, '_cnv'))

display('overlapped gene with CNV')

segs_filtered = cbind(
    segs_filtered %>% 
        select_if((!colnames(.) %in% colnames(cnv_overlaps)) | colnames(.) == 'dmp_patient_id'), 
    cnv_overlaps)

segs_filtered = segs_filtered %>%
    mutate_at(colnames(cnv_overlaps), list(amp = function(x){x * (.[['type']] == 'amp')})) %>%
    mutate_at(colnames(cnv_overlaps), list(deloh = function(x){x * (.[['type']] %in% c('del', 'loh'))})) %>%
    mutate_at(colnames(cnv_overlaps), list(del = function(x){x * (.[['type']] == 'del')})) %>%
    mutate_at(colnames(cnv_overlaps), list(loh = function(x){x * (.[['type']] == 'loh')}))

GENE_CNV = segs_filtered %>% 
    group_by(dmp_patient_id) %>%
    summarise_at(
        str_subset(colnames(.), '_cnv'),
        max
    )

M_wide = M_wide %>%
    select_if((!colnames(.) %in% colnames(GENE_CNV)) | colnames(.) == 'dmp_patient_id') %>%
    left_join(
        GENE_CNV,
        by = "dmp_patient_id",
        fill = 0
    ) %>%
    mutate_at(colnames(GENE_CNV), function(x){ifelse(is.na(x), 0, x)})

display('merging with main dataframe')

GENEALL = M_long %>%
    reshape2::dcast(
      formula = dmp_patient_id ~ Gene,
      value.var = 'Gene',
      fun.aggregate = function(x) {min(1, length(x))}
    ) %>%
    setNames(c('dmp_patient_id', paste0(colnames(.)[-1], '_all')))

M_wide = M_wide %>%
    select_if((!colnames(.) %in% colnames(GENEALL)) | colnames(.) == 'dmp_patient_id') %>%
    left_join(GENEALL, by = "dmp_patient_id") %>%
    mutate_at(colnames(GENEALL), function(x){ifelse(is.na(x), 0, x)})

segs_filtered = segs_filtered %>%
    select_if((!colnames(.) %in% colnames(GENEALL)) | colnames(.) == 'dmp_patient_id') %>%
    left_join(GENEALL, by = "dmp_patient_id") %>%
    mutate_at(colnames(GENEALL), function(x){ifelse(is.na(x), 0, x)})
'overlapped gene with CNV'
'merging with main dataframe'
In [6]:
cis_effects = data.frame()

for (gene in cis_genes) {
        
    D = M_wide %>% filter(!is.na(get(gene)))
    
    sm_count = sum(D[[paste0(gene, '')]])
    cnv_count = sum(D[[paste0(gene, '_cnv')]])
    del_count = sum(D[[paste0(gene, '_del')]])
    loh_count = sum(D[[paste0(gene, '_loh')]])
    intersect_cnv = sum(D[[paste0(gene, '')]] == 1 & D[[paste0(gene, '_cnv')]] == 1)
    intersect_del = sum(D[[paste0(gene, '')]] == 1 & D[[paste0(gene, '_cnv_del')]] == 1)
    intersect_loh = sum(D[[paste0(gene, '')]] == 1 & D[[paste0(gene, '_cnv_loh')]] == 1)
    
    #  pval_all = fisher.test(table(D[[paste0(gene, '')]], D[[paste0(gene, '_cnv')]]))$p.value

    if (intersect_del >= 2) {
        
        pval = fisher.test(table(D[[paste0(gene, '')]], D[[paste0(gene, '_cnv_del')]]))$p.value
        
        cis_effects = rbind(
            cis_effects,
            data.frame(
                'Gene' = gene,
                'cnv_type' = 'del',
                'sm_count' = sm_count,
                'cnv_count' = del_count,
                'intersect_count' = intersect_del,
                'pval' = pval
            )
        )
    }
    
    if (intersect_loh >= 2) {
        
        pval = fisher.test(table(D[[paste0(gene, '')]], D[[paste0(gene, '_cnv_loh')]]))$p.value
        
        cis_effects = rbind(
            cis_effects,
            data.frame(
                'Gene' = gene,
                'cnv_type' = 'loh',
                'sm_count' = sm_count,
                'cnv_count' = del_count,
                'intersect_count' = intersect_loh,
                'pval' = pval
            )
        )
    }
}

cis_effects = cis_effects %>% mutate(q = p.adjust(pval, method = 'BH', n = length(cis_genes) * 2))

cis_effects = cis_effects %>%
    mutate(Gene = as.character(Gene)) %>%
    left_join(gene_bed, by = 'Gene') %>%
    arrange(chrom) %>% mutate(Gene = factor(Gene, unique(Gene)))
In [7]:
cis_effects
A data.frame: 10 × 10
Genecnv_typesm_countcnv_countintersect_countpvalqchromstartend
<fct><fct><int><int><int><dbl><dbl><int><dbl><dbl>
MPL loh 230 42.412140e-112.251330e-10 1 43803478 43818443
DNMT3Adel37120 53.724484e-021.042855e-01 2 25455845 25565459
TET2 del11090 41.032992e-034.131968e-03 4106067032106200973
TET2 loh11090 52.697662e-071.888363e-06 4106067032106200973
EZH2 loh 430 63.766733e-185.273427e-17 7148504475148581413
JAK2 loh 1040112.547642e-277.133399e-26 9 4985033 5128183
ATM del 2280 21.713692e-035.331487e-0311108093211108239829
ATM loh 2280 21.713692e-035.331487e-0311108093211108239829
TP53 del 3540 21.187343e-045.540933e-0417 7565097 7590856
TP53 loh 3540 34.364776e-052.444275e-0417 7565097 7590856
In [8]:
gene_cnvs = function(gene, qval) {
    
    all_colors = c(
        'amp' = 'darkblue', 'loh' = 'darkgreen', 'del' = 'darkred',
        'Regulatory' = 'darkturquoise', 'Missense' = 'salmon', 'Truncating' = 'purple') 

    gene_chrom = gene_bed %>% filter(Gene == gene) %>% pull(chrom)
    
    gene_structure = gene_structures %>% filter(str_detect(description, gene))

    gene_locs = gene_bed %>% 
        filter(Gene == gene) %>%
        as.data.frame(stringsAsFactors = F) %>%
        mutate(loc = as.integer((start + end) / 2), chrom = as.integer(chrom))

    this_chrom_lens = chrom_lens %>% filter(chrom == gene_chrom)

    acen_chrom = acen %>% filter(chrom == gene_chrom)

    D = segs_filtered %>%
        filter(chrom == gene_chrom) %>%
        mutate(chrom = factor(chrom)) %>%
        group_by(dmp_sample_id, chrom) %>%
        mutate(total_length = sum(size)) %>%
        mutate(type = factor(type, c('amp', 'loh', 'del'))) %>%
        arrange(desc(type), round(start/1e7), -total_length) %>%
        group_by(chrom) %>%
        mutate(index = as.integer(factor(dmp_sample_id, unique(dmp_sample_id)))) %>%
        ungroup()

    M_overlap = M_long %>%
        filter(Gene == gene) %>%
        filter(dmp_patient_id %in% D$dmp_patient_id) %>%
        rowwise() %>%
        filter(
            any(chrom == D$chrom & dmp_patient_id == D$dmp_patient_id)
#                 start >= D$start - window & start <= D$end + window)
        ) %>%
        left_join(
            D %>% select(dmp_sample_id, index),
            by = 'dmp_sample_id'
        ) %>%
        ungroup() %>%
        mutate(VariantClass = case_when(
            VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
            T ~ 'coding'
        ))

    p_segs = ggplot(
            D,
            aes(x = start, xend = end, y = index, yend = index, color = type)
        ) +
        geom_rect(
            inherit.aes = F,
            data = acen_chrom,
            aes(xmin = cen_start, xmax = cen_end, ymin = -Inf, ymax = Inf),
            fill = 'gray',
            size = 0,
            alpha = 0.3
        ) +
        geom_segment(
            size = 1,
            alpha = 0.8,
            lineend = 'round'
        ) +
        geom_segment(
            inherit.aes = F,
            data = this_chrom_lens,
            aes(x = 0, xend = length, y = 0, yend = 0),
            color = 'black',
            size = 0
        ) +
        # label the mutations
#         geom_point(
#             inherit.aes = F,
#             mapping = aes(x = start, y = index, color = VariantClass2),
#             data = M_overlap,
#             size = 6.5,
#             pch = '*',
#             color = 'white'
#         ) +
        geom_point(
            inherit.aes = F,
            mapping = aes(x = start, y = index, fill = VariantClass2),
            data = M_overlap,
            size = 1.5,
            color = 'black',
            pch = 21
        ) +
#         geom_text_repel(
#             inherit.aes = F,
#             mapping = aes(x = start, y = index, color = VariantClass2, label = AAchange),
#             data = M_overlap %>% mutate(zoom = TRUE),
#             size = 1.5,
#             segment.size = 0.2,
#             color = 'black',
#             ylim = c(0,NA)
#         ) +
        scale_alpha(
            range = c(0, 1)
        ) +
        theme_classic() +
        theme(
            axis.text = element_blank(),
            axis.ticks = element_blank(),
            axis.title = element_blank(),
            legend.position = 'none',
            panel.spacing.y = unit(1, "mm"),
            plot.margin = unit(c(0.5,0.2,0.2,0.2), "lines"),
            strip.text.y = element_blank(),
            panel.border = element_rect(size = 0.2, fill = NA, color = 'gray'),
            strip.background = element_rect(size = 0.5, fill = 'seashell3', color = 'seashell3'),
            plot.title = element_text(size = 7)
        ) +
        scale_y_discrete(expand = expansion(add = 1)) +
        xlab('') +
        ylab('') +
        scale_color_manual(values = all_colors) +
        scale_fill_manual(values = all_colors) +
        ggtitle(paste0(gene, ', chr', gene_chrom)) +
        # exon structures
        geom_segment(
            inherit.aes = F,
            data = gene_structure %>% mutate(zoom = TRUE),
            aes(x = min(start), xend = max(end), y = -1, yend = -1),
            size = 0.1,
            color = 'black'
        ) +
        geom_segment(
            inherit.aes = F,
            data = gene_structure %>% mutate(zoom = TRUE),
            aes(x = start, xend = end, y = -1, yend = -1),
            size = 2,
            color = 'black'
        ) +
        facet_zoom(
            zoom.size = 1,
            zoom.data = zoom,
            xlim = c(gene_locs$start, gene_locs$end)
        )

    p_segs
}
In [9]:
plots = list()

for (i in 1:nrow(cis_effects)) {
    gene = as.character(cis_effects[i,'Gene'])
    qval = cis_effects[i,'q']
    pval = cis_effects[i,'pval']
    plots[[gene]] = gene_cnvs(gene, pval)
}

p_cis = plot_grid(plotlist = plots, nrow = 2)

do_plot(p_cis, 'cis_loci.pdf', w = 6, h = 5)
In [13]:
cis_total = sum(cis_effects$intersect_count)
cis_total %>% paste0(' total cis events')
(cis_total/nrow(segs_filtered_auto)) %>% signif(3) %>% {.*100} %>% paste0('% of total events')
'44 total cis events'
'13.1% of total events'
In [16]:
window = 5e6

segs_filtered_auto = segs_filtered %>% filter(chrom != 23)

M_overlap = M_long %>%
    filter(dmp_patient_id %in% segs_filtered_auto$dmp_patient_id) %>%
    rowwise() %>%
    mutate(
        locality = ifelse(
            any(chrom == segs_filtered_auto$chrom & dmp_patient_id == segs_filtered_auto$dmp_patient_id &
                start >= segs_filtered_auto$start - window & start <= segs_filtered_auto$end + window
            ),
            'cis',
            'trans'
        )
    ) %>%
    ungroup() %>%
    mutate(variant_type = case_when(
        VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
        T ~ 'coding'
    ))


M_overlap_nonsilent = M_overlap %>% filter(variant_type == 'coding')

segs_filtered_auto %>%
    rowwise() %>%
    mutate(
        cis_coding = as.integer(
            any(chrom == M_overlap_nonsilent$chrom & dmp_patient_id == M_overlap_nonsilent$dmp_patient_id &
                M_overlap_nonsilent$start >= start - window & M_overlap_nonsilent$start <= end + window
            ))
    ) %>%
    group_by(type) %>%
    summarise(
        n = n(),
        prop_cis = sum(cis_coding)/n
    )
`summarise()` ungrouping output (override with `.groups` argument)

A tibble: 3 × 3
typenprop_cis
<chr><int><dbl>
amp 560.08928571
del1560.08333333
loh1230.27642276

export CNV cases without GM for review

In [674]:
genes = c('ATM', 'TET2', 'EZH2', 'DNMT3A', 'JAK2', 'MPL', 'TP53')

M_wide %>% 
filter(
    (ATM_cnv | TET2_cnv | EZH2_cnv | DNMT3A_cnv | JAK2_cnv | MPL_cnv | TP53_cnv)
) %>%
setDT() %>%
data.table::melt(
    measure.vars = list(
        'CNV' = paste0(genes, '_cnv'),
        'GM' = genes
    ),
    variable.name = 'Gene',
    variable.factor = FALSE
) %>%
mutate(
    Gene = setNames(genes, 1:length(genes))[Gene]
) %>%
filter(CNV == 1 & GM != 1) %>%
select(dmp_sample_id, Gene, CNV, GM) #%>%
#fwrite('./data/sm_review_jun10.tsv', sep = '\t')

GM CNV VAF correlation

In [233]:
window = 1e7

M_overlap = M_long %>%
    filter(dmp_patient_id %in% segs_filtered$dmp_patient_id) %>%
    rowwise() %>%
    filter(
        any(chrom == segs_filtered$chrom & dmp_patient_id == segs_filtered$dmp_patient_id &
           start >= segs_filtered$start - window & start <= segs_filtered$end + window)
    ) %>%
    ungroup() %>%
    mutate(VariantClass = case_when(
        VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
        str_detect(VariantClass, 'Del|Ins') ~ 'indel',
        T ~ 'snv'
    )) %>%
    filter(VariantClass != 'noncoding') %>%
    filter(Gene %in% c("EZH2", "ATM", "TET2", "JAK2", "MPL", 'DNMT3A', 'TP53')) %>%
    arrange(desc(VAF_N)) %>%
    distinct(dmp_patient_id, `.keep_all` = T)
    

p_vaf = segs_filtered %>%
    left_join(
        M_overlap %>% select(dmp_patient_id, chrom, sm_pos = start, Gene, VariantClass, VAF_N),
        by = c('chrom', 'dmp_patient_id')
    ) %>%
    filter(type != 'amp') %>%
    filter(VAF_N > 0) %>%
    mutate(
        phi_sm = case_when(
            type == 'del' ~ 2 * VAF_N/(1 + VAF_N),
            type == 'loh' ~ VAF_N
        )
    ) %>%
    mutate(pigeonhole = phi_sm + phi > 1) %>%
    ggplot(
        aes(x = VAF_N, y = phi, color = type2, label = Gene)
    ) +
    # stat_function(fun = function(v){2*v}, linetype = 'dashed', color = cnv_pal['err']) +
    # stat_function(fun = function(v){2 * v/(1+v)}, linetype = 'dashed', color = cnv_pal['del'], alpha = 0.5) +
    # geom_abline(linetype = 'dashed', color = cnv_pal['loh'], alpha = 0.5) +
    stat_function(fun = function(v){2 * v}, linetype = 'dashed', color = cnv_pal['err']) +
    stat_function(fun = function(v){2 * v/(1+v)}, linetype = 'dashed', color = cnv_pal['del'], alpha = 0.5) +
    geom_abline(linetype = 'dashed', color = cnv_pal['loh'], alpha = 0.5) +
#     annotate('text', x = 0.35, y = 1, label = 'trans', color = cnv_pal['err'], size = 3) +
#     annotate('text', x = 0.75, y = 1, label = 'DEL', color = cnv_pal['del'], size = 3, alpha = 0.5) +
#     annotate('text', x = 0.9, y = 0.7, label = 'CNLOH', color = cnv_pal['loh'], size = 3, alpha = 0.5) +
    theme_classic() +
    # stat_cor(label.y = 0.95, size = 3) +
    scale_color_manual(values = cnv_pal2) +
    geom_point(
        aes(pch = pigeonhole),
#         pch = 21
    ) +
    scale_shape_manual(values = c('TRUE' = 19, 'FALSE' = 21)) +
    xlim(0,1) +
    ylim(0,1) +
    geom_text(size = 1, vjust = -1, hjust = -0.2) +
    facet_wrap(~type2, nrow = 1, scale = 'free') +
    theme(
        legend.position = 'none',
        strip.background = element_blank()
    ) +
    xlab('Gene mutation VAF') +
    ylab('mCA cell fraction')

do_plot(p_vaf, 'cis_vaf.pdf', w = 4.6, h = 2.5)
Warning message:
“Removed 50 row(s) containing missing values (geom_path).”
Warning message:
“Removed 50 row(s) containing missing values (geom_path).”

microarray validation

In [1173]:
double_hits_cyto = M_wide %>% filter(
    (MPL & chr1 & VAF_MPL < 0.35) |
    (DNMT3A & chr2 & VAF_DNMT3A < 0.35) |
    (TET2 & chr4 & VAF_TET2 < 0.35) |
    (EZH2 & chr7 & VAF_EZH2 < 0.35) |
    (JAK2 & chr9 & VAF_JAK2 < 0.35) |
    (ATM & chr11 & VAF_ATM < 0.35) |
    (TP53 & chr17 & VAF_TP53 < 0.35)
) %>% 
mutate(hits = case_when(
    MPL & chr1 ~ 'MPL',
    DNMT3A & chr2 ~ 'DNMT3A',
    TET2 & chr4 ~ 'TET2',
    EZH2 & chr7 ~ 'EZH2',
    JAK2 & chr9 ~ 'JAK2',
    ATM & chr11 ~ 'ATM',
    TP53 & chr17 ~ 'TP53'
)) %>%
select(dmp_patient_id, dmp_sample_id, hits) %>%
arrange(hits)

# double_hits_cyto %>% fwrite('./data/double_hits_cyto.tsv', sep = '\t')
In [234]:
options(repr.plot.width = 6, repr.plot.height = 2.3, repr.plot.res = 300)

window = 1e7

M_overlap = M_long %>%
    filter(dmp_patient_id %in% segs_filtered$dmp_patient_id) %>%
    rowwise() %>%
    filter(
        !any(chrom == segs_filtered$chrom & dmp_patient_id == segs_filtered$dmp_patient_id &
           start >= segs_filtered$start - window & start <= segs_filtered$end + window)
    ) %>%
    ungroup() %>%
    mutate(VariantClass = case_when(
        VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
        str_detect(VariantClass, 'Del|Ins') ~ 'indel',
        T ~ 'snv'
    )) %>%
    filter(VariantClass != 'noncoding') %>%
    arrange(desc(VAF_N)) %>%
    distinct(dmp_patient_id, `.keep_all` = T)

p_vaf = segs_filtered %>%
    arrange(desc(phi)) %>%
    distinct(dmp_patient_id, `.keep_all` = T) %>%
    left_join(
        M_overlap %>% select(dmp_patient_id, chrom, sm_pos = start, Gene, VariantClass, VAF_N),
        by = c('dmp_patient_id')
    ) %>%
    mutate(pigeonhole = 2 * VAF_N + phi > 1) %>%
    ggplot(
        aes(x = VAF_N, y = phi, color = type2, label = Gene)
    ) +
    stat_function(fun = function(v){2*v}, linetype = 'dashed', color = cnv_pal['err']) +
    stat_function(fun = function(v){2 * v/(1+v)}, linetype = 'dashed', color = cnv_pal['del'], alpha = 0.5) +
    geom_abline(linetype = 'dashed', color = cnv_pal['loh'], alpha = 0.5) +
#     annotate('text', x = 0.35, y = 1, label = 'trans', color = cnv_pal['err'], size = 3) +
#     annotate('text', x = 0.75, y = 1, label = 'cis-del', color = cnv_pal['del'], size = 3, alpha = 0.5) +
#     annotate('text', x = 0.9, y = 0.7, label = 'cis-loh', color = cnv_pal['loh'], size = 3, alpha = 0.5) +
    theme_classic() +
    # geom_abline(linetype = 'dashed') +
    scale_shape_manual(values = c('TRUE' = 19, 'FALSE' = 21)) +
    # stat_cor(label.y = 0.98, size = 3) +
    scale_color_manual(values = cnv_pal2) +
    geom_point(
    #     pch = 21,
        aes(pch = pigeonhole)
    ) +
    xlim(0,1) +
    ylim(0,1) +
    # geom_text(size = 1, vjust = -1, hjust = -0.2) +
    # scale_shape_manual(values = c(1,5)) +
    facet_wrap(~type2, nrow = 1, scale = 'free') +
    theme(
        legend.position = 'none',
        strip.background = element_blank()
    ) +
    ylab('mCA cell fraction') +
    xlab('Gene mutation VAF')

do_plot(p_vaf, 'trans_vaf.pdf', w = 6.8, h = 2.5)
Warning message:
“Removed 50 row(s) containing missing values (geom_path).”
Warning message:
“Removed 145 rows containing missing values (geom_point).”
Warning message:
“Removed 50 row(s) containing missing values (geom_path).”
Warning message:
“Removed 145 rows containing missing values (geom_point).”

View samples

In [576]:
M_wide %>% filter(nonparta == 0 & chr7_del == 1 & ch_nonsilent) %>% filter(dmp_patient_id %in% segs_filtered_rerun)
In [714]:
sid = 'P-0014033-N01-IM5'
outdir = '/work/isabl/public/facets-ch/res'
seg = fread(glue('{outdir}/{sid}_seg.tsv'))
sig = fread(glue('{outdir}/{sid}_sig.tsv'))
In [712]:
source('/work/isabl/home/gaot/facets-ch/R/facets-ch-utils.R')

seg_prime = call_aberrant(sig, get_seg(sig), p_threshold = 5e-5)
In [680]:
segs_filtered_rerun2 = segs_filtered_rerun %>% filter((p_chisq < 1e-10 & p_y < 1e-3))
In [ ]:
M_wide %>% filter(dmp_patient_id == 'P-0018453') %>% pull(heme_cat)
In [277]:
options(repr.plot.width = 10, repr.plot.height = 2.5, repr.plot.res = 300)

samples = segs_filtered %>% filter(dmp_patient_id == 'P-0018453')

for (i in 1:nrow(samples)) {
    
    outdir = '/work/isabl/home/gaot/ch_cnv/results'
    
    sid = samples[i,'dmp_sample_id']    
    
    plotfile = glue('{outdir}/{sid}.png')
    seg = fread(glue('{outdir}/{sid}_seg.tsv'))
    display(seg %>% filter(aberrant) %>% select(phi, err, type, p_chisq, start, end))
    display_png(file = plotfile)
    
}
A data.table: 1 × 6
phierrtypep_chisqstartend
<dbl><dbl><chr><dbl><int><int>
0.360.00083874loh042201333676027

Blood count phenotypes

In [244]:
give.median <- function(x){
  return(c(x = median(x), label = median(x))) 
}

D = M_wide %>% 
    filter(anc < 30 & alc < 8 & amc < 3 & plt < 1000 | is.na(anc) | is.na(alc) | is.na(amc) | is.na(plt)) %>%
    mutate(group = ifelse(ch_cnv == 1, 'mCA+', 'mCA-')) %>%
    mutate(group = factor(group, c('mCA+', 'mCA-'))) %>%
    group_by(group) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    mutate(group_label = paste0(group, '\n(n=', n, ')')) %>%
    mutate(group_label = factor(group_label, unique(group_label))) %>%
    reshape2::melt(measure.vars = cbcs[cbcs != 'rbc']) %>%
    mutate(variable = str_wrap(cbc_labels[variable], width = 3))

p_cbc = ggplot(
    D,
    aes(x = value, fill = group_label)
) +
geom_histogram(bins = 20, show.legend = F) +
# stat_summary(aes(y = 0), fun.data = give.median, geom = "text", vjust = 0, size = 2) +
facet_grid(group_label~variable, scale = 'free') +
scale_y_continuous(expand = c(0.2,0)) +
scale_x_continuous(expand = c(0.1,0)) +
geom_text(
    inherit.aes = F,
    data = D %>% group_by(group_label, variable) %>% summarise(median = median(na.omit(value))),
    mapping = aes(y = 0, x = median, label = median),
    size = 3,
    vjust = 1
) +
theme_bw() +
theme(
    panel.spacing = unit(1, "mm"),
    legend.position = 'top',
    legend.title = element_blank(),
    axis.text.x = element_text(size = 8, angle = 30, hjust = 1),
    strip.text = element_text(size = 6),
    strip.background = element_blank()
) +
xlab('') +
ylab('Number of subjects')

do_plot(p_cbc, 'cbc_dist.pdf', w = 8, h = 2.5)
`summarise()` regrouping output by 'group_label' (override with `.groups` argument)

Warning message:
“Removed 26252 rows containing non-finite values (stat_bin).”
Warning message:
“Removed 26252 rows containing non-finite values (stat_bin).”

WHO criteria for cytopenia

In [245]:
M_wide = M_wide %>% 
    mutate(
        thrombocytopenia = plt < 100,
        anemia = hgb < 10,
        neutropenia = anc < 1.8,
        cytopenia = thrombocytopenia | anemia | neutropenia
    )

D = M_wide %>% 
    mutate(group = factor(ifelse(ch_cnv == 1, 'mCA+', 'mCA-'), c('mCA-', 'mCA+'))) %>%
#     mutate(group = factor(group, c('none', 'sm', 'cnv', 'both'))) %>%
    select(group, thrombocytopenia, anemia, neutropenia, cytopenia) %>%
    filter(complete.cases(.)) %>%
    group_by(group) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    mutate(group_label = paste0(group, '(n=', n, ')')) %>%
    mutate(group_label = factor(group_label, unique(group_label))) %>%
    reshape2::melt(measure.vars = c('thrombocytopenia', 'anemia', 'neutropenia', 'cytopenia')) %>%
    mutate(variable = ifelse(variable == 'cytopenia', 'any cytopenia', as.character(variable))) %>%
    mutate(variable = Hmisc::capitalize(variable)) %>%
    mutate(variable = factor(variable, unique(variable))) %>%
    group_by(group_label, group, variable) %>%
    summarise(
        total = n(),
        prop = sum(value)/total,
        lower = qbinom(p = 0.05, size = total, prob = prop)/total,
        upper = qbinom(p = 0.95, size = total, prob = prop)/total,
    )

p_cytopenia = ggplot(
    D,
    aes(x = group, y = prop, ymin = lower, ymax = upper, color = group_label)
) +
geom_errorbar(position = position_dodge(width = 0.3), width = 0.2) +
geom_point(position = position_dodge(width = 0.3)) +
theme_classic() +
facet_wrap(~variable, scale = 'free_x', nrow = 1) +
ylab('Proportion of subjects') +
xlab('') +
theme(
    legend.title = element_blank(),
    strip.text = element_text(size = 7)
#     legend.position = 'none'
)

do_plot(p_cytopenia, 'cytopenia.pdf', w = 6.5, h = 2.2)
`summarise()` regrouping output by 'group_label', 'group' (override with `.groups` argument)

In [73]:
unique(M_wide$ch_cnv_auto)
  1. 0
  2. 1
In [74]:
unique(M_wide$ch_nonsilent_auto)
  1. 1
  2. 0
In [150]:
#### R1 Blood counts mCA +/- GM violin plots MS edited

M_wide_Bcounts = M_wide %>% mutate (category = ifelse( ch_cnv_auto  & ch_nonsilent_auto, "mCA + GM", "Rest" )) %>%
filter(anc < 30 & alc < 8 & amc < 3 & plt < 1000 | is.na(anc) | is.na(alc) | is.na(amc) | is.na(plt))  %>% 
group_by(category) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    mutate(category_label = paste0(category, '\n(n=', n, ')')) %>%
    mutate(category_label = factor(category_label, unique(category_label))) %>%
reshape2::melt(measure.vars = cbcs, variable.name = 'bc', value.name = 'count')


p = ggplot(M_wide_Bcounts, aes(x = category, y = count, fill = category_label)) + geom_violin(fill = "lightgrey") + facet_wrap(~bc, scale = "free", nrow = 1) + 
    geom_boxplot(width=0.1, outlier.alpha = 0)+  theme_classic() + labs(fill = "")
do_plot(p, "R1_bloodcounts_mCA_vs_rest_n.pdf", w = 15, h = 2)
Warning message:
“Removed 29013 rows containing non-finite values (stat_ydensity).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_boxplot).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_ydensity).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_boxplot).”
In [134]:
#### R1 Blood counts trans violin plots MS edited

# segs_filtered_auto = segs_filtered_auto %>% mutate(
#         trans_effect = 
#             (chrom == 3 & type == 'amp' & TP53) |
#             (chrom == 4 & type == 'del' & (!TET2) & (SRSF2 | ATM | CHEK2 | ASXL1)) |
#             (chrom == 5 & type == 'del' & TP53) |
#             (chrom == 7 & type == 'del' & (TP53 | PPM1D)) |
#             (chrom == 8 & type == 'amp' & TET2) |
#             (chrom == 12 & type == 'amp' & (NOTCH1 | MYD88 | FBXW7)) |
#             (chrom == 13 & type == 'del' & ATM) |
#             (chrom == 20 & type == 'del' & (U2AF1 | TERT))
#     )



M_wide_trans = M_wide %>% mutate(
        trans_effect = 
            (chr3_amp & TP53) |
            (chr4_del & (!TET2) & (SRSF2 | ATM | CHEK2 | ASXL1)) |
            (chr5_del & TP53) |
            (chr7_del & (TP53 | PPM1D)) |
            (chr8_amp & TET2) |
            (chr12_amp & (NOTCH1 | MYD88 | FBXW7)) |
            (chr13_del & ATM) |
            (chr20_del & (U2AF1 | TERT))
    ) %>% mutate(
        cis_effect = 
            (chr1_loh & MPL) |
            (chr2_del & DNMT3A) | 
            ((chr4_del |chr4_loh) & TET2) | 
            (chr7_loh & EZH2) | 
            (chr9_loh & JAK2) | 
            ((chr11_loh | chr11_del) & ATM) | 
            (chr17 & TP53)
)  %>% filter(anc < 30 & alc < 8 & amc < 3 & plt < 1000 | is.na(anc) | is.na(alc) | is.na(amc) | is.na(plt))  %>%
mutate(category = ifelse(trans_effect|cis_effect , "cis/trans", "Rest")) %>% group_by(category) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    mutate(category_label = paste0(category, '\n(n=', n, ')')) %>%
    mutate(category_label = factor(category_label, unique(category_label))) %>%
reshape2::melt(measure.vars = cbcs, variable.name = 'bc', value.name = 'count') 




# M_wide_trans %>% count(trans_effect | cis_effect)



p = ggplot(M_wide_trans, aes(x = category , y = count)) + geom_violin(fill = "lightgrey") + facet_wrap(~bc, scale = "free", nrow = 1) + geom_boxplot(width=0.1, outlier.alpha = 0)+  theme_classic()
do_plot(p, "R1_bloodcounts_cistrans_vs_rest.pdf", w = 15, h = 2)


p = ggplot(M_wide_trans, aes(x = category, y = count, fill = category_label)) + geom_violin(fill = "lightgrey") + facet_wrap(~bc, scale = "free", nrow = 1) + 
    geom_boxplot(width=0.1, outlier.alpha = 0)+  theme_classic() + labs(fill = "")
do_plot(p, "R1_bloodcounts_cistrans_vs_rest_n.pdf", w = 15, h = 2)
Warning message:
“Removed 29013 rows containing non-finite values (stat_ydensity).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_boxplot).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_ydensity).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_boxplot).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_ydensity).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_boxplot).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_ydensity).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_boxplot).”
In [105]:
M_wide %>% count(heme_cat2)
A tibble: 9 × 2
heme_cat2n
<fct><int>
MDS 26
MPN 7
AML 11
CML 4
CLL 12
B-NHL 27
T-NHL 2
MM 7
NA 32346
In [142]:
M_wide %>% filter(alc>5 & heme_cat2 == "CLL") %>% count(ch_cnv)
A tibble: 2 × 2
ch_cnvn
<dbl><int>
02
11
In [147]:
segs_filtered %>% filter(alc>5 & heme_cat2 == "CLL") %>% count(chrom, dmp_sample_id, type)
A tibble: 2 × 4
chromdmp_sample_idtypen
<int><chr><chr><int>
1P-0013232-N01-IM5del1
13P-0013232-N01-IM5loh1
In [138]:
#### R1 Blood counts heme category disease violin plots MS edited

M_wide_hemecat = M_wide %>% mutate( heme_cat3 = case_when( 
                                    heme_cat2 %in%c( "MDS", "MPN", "AML", "CML") ~ "Myeloid", 
                                    heme_cat2 %in% "CLL" ~ c("CLL"),
                                    heme_cat2 %in% c("B-NHL", "T-NHL") ~ "Lymphoid" , 
                                    T  ~ "None" 


)) %>% filter(anc < 30 & alc < 8 & amc < 3 & plt < 1000 | is.na(anc) | is.na(alc) | is.na(amc) | is.na(plt))  %>% 
    group_by(heme_cat3) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    mutate(category_label = paste0(heme_cat3, '\n(n=', n, ')')) %>%
    mutate(category_label = factor(category_label, unique(category_label))) %>%
reshape2::melt(measure.vars = cbcs, variable.name = 'bc', value.name = 'count') 


p = ggplot(M_wide_hemecat, aes(x = heme_cat3 , y = count)) + geom_violin(fill = "lightgrey") + facet_wrap(~bc, scale = "free", nrow = 1) + 
    geom_boxplot(width=0.1, outlier.alpha = 0)+  theme_classic()+ theme(axis.text.x = element_text(angle = 45, hjust=1))
do_plot(p, "R1_bloodcounts_heme_disease_1row.pdf", w = 20, h = 2) 


p = ggplot(M_wide_hemecat, aes(x = heme_cat3, y = count, fill = category_label)) + geom_violin(fill = "lightgrey") + facet_wrap(~bc, scale = "free", nrow = 1) + 
    geom_boxplot(width=0.1, outlier.alpha = 0)+  theme_classic() + labs(fill = "") + theme(axis.text.x = element_text(angle = 45, hjust=1))
do_plot(p, "R1_bloodcounts_heme_disease_1row_n.pdf", w = 15, h = 2)

p = ggplot(M_wide_hemecat, aes(x = heme_cat3 , y = count)) + geom_violin(fill = "lightgrey") + facet_wrap(~bc, scale = "free", nrow = 2) + 
    geom_boxplot(width=0.1, outlier.alpha = 0)+  theme_classic()
do_plot(p, "R1_bloodcounts_heme_disease_2row.pdf", w = 15, h = 5)
Warning message:
“Removed 29013 rows containing non-finite values (stat_ydensity).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_boxplot).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_ydensity).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_boxplot).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_ydensity).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_boxplot).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_ydensity).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_boxplot).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_ydensity).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_boxplot).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_ydensity).”
Warning message:
“Removed 29013 rows containing non-finite values (stat_boxplot).”
In [97]:
colnames(M_wide)
  1. 'dmp_patient_id'
  2. 'ABL1'
  3. 'ACVR1'
  4. 'AKT1'
  5. 'AKT2'
  6. 'AKT3'
  7. 'ALK'
  8. 'ALOX12B'
  9. 'AMER1'
  10. 'ANKRD11'
  11. 'APC'
  12. 'AR'
  13. 'ARAF'
  14. 'ARID1A'
  15. 'ARID1B'
  16. 'ARID2'
  17. 'ARID5B'
  18. 'ASXL1'
  19. 'ASXL2'
  20. 'ATM'
  21. 'ATR'
  22. 'ATRX'
  23. 'AURKA'
  24. 'AURKB'
  25. 'AXIN1'
  26. 'AXIN2'
  27. 'AXL'
  28. 'B2M'
  29. 'BABAM1'
  30. 'BAP1'
  31. 'BARD1'
  32. 'BBC3'
  33. 'BCL10'
  34. 'BCL2'
  35. 'BCL2L11'
  36. 'BCL6'
  37. 'BCOR'
  38. 'BIRC3'
  39. 'BLM'
  40. 'BMPR1A'
  41. 'BRAF'
  42. 'BRCA1'
  43. 'BRCA2'
  44. 'BRD4'
  45. 'BRIP1'
  46. 'BTK'
  47. 'CALR'
  48. 'CARD11'
  49. 'CARM1'
  50. 'CASP8'
  51. 'CBFB'
  52. 'CBL'
  53. 'CCND1'
  54. 'CCND2'
  55. 'CCND3'
  56. 'CCNE1'
  57. 'CD274'
  58. 'CD276'
  59. 'CD79A'
  60. 'CD79B'
  61. 'CDC42'
  62. 'CDC73'
  63. 'CDH1'
  64. 'CDK12'
  65. 'CDK4'
  66. 'CDK6'
  67. 'CDK8'
  68. 'CDKN1A'
  69. 'CDKN1B'
  70. 'CDKN2B'
  71. 'CDKN2C'
  72. 'CEBPA'
  73. 'CENPA'
  74. 'CHEK1'
  75. 'CHEK2'
  76. 'CIC'
  77. 'CREBBP'
  78. 'CRKL'
  79. 'CRLF2'
  80. 'CSDE1'
  81. 'CSF1R'
  82. 'CSF3R'
  83. 'CTCF'
  84. 'CTLA4'
  85. 'CTNNB1'
  86. 'CUL3'
  87. 'CXCR4'
  88. 'CYLD'
  89. 'CYSLTR2'
  90. 'DAXX'
  91. 'DCUN1D1'
  92. 'DDR2'
  93. 'DICER1'
  94. 'DIS3'
  95. 'DNAJB1'
  96. 'DNMT1'
  97. 'DNMT3A'
  98. 'DNMT3B'
  99. 'DOT1L'
  100. 'DROSHA'
  101. 'DUSP4'
  102. 'E2F3'
  103. 'EED'
  104. 'EGFL7'
  105. 'EGFR'
  106. 'EIF1AX'
  107. 'EIF4A2'
  108. 'ELF3'
  109. 'EP300'
  110. 'EPAS1'
  111. 'EPCAM'
  112. 'EPHA3'
  113. 'EPHA5'
  114. 'EPHA7'
  115. 'EPHB1'
  116. 'ERBB2'
  117. 'ERBB3'
  118. 'ERBB4'
  119. 'ERCC2'
  120. 'ERCC3'
  121. 'ERCC4'
  122. 'ERCC5'
  123. 'ERF'
  124. 'ERG'
  125. 'ERRFI1'
  126. 'ESR1'
  127. 'ETV1'
  128. 'ETV6'
  129. 'EZH1'
  130. 'EZH2'
  131. 'FAM175A'
  132. 'FAM46C'
  133. 'FAM58A'
  134. 'FANCA'
  135. 'FANCC'
  136. 'FAT1'
  137. 'FBXW7'
  138. 'FGF19'
  139. 'FGF3'
  140. 'FGF4'
  141. 'FGFR1'
  142. 'FGFR2'
  143. 'FGFR3'
  144. 'FGFR4'
  145. 'FH'
  146. 'FLCN'
  147. 'FLT1'
  148. 'FLT3'
  149. 'FLT4'
  150. 'FOXA1'
  151. 'FOXL2'
  152. 'FOXO1'
  153. 'FOXP1'
  154. 'FUBP1'
  155. 'FYN'
  156. 'GATA1'
  157. 'GATA2'
  158. 'GATA3'
  159. 'GLI1'
  160. 'GNA11'
  161. 'GNAQ'
  162. 'GNAS'
  163. 'GPS2'
  164. 'GREM1'
  165. 'GRIN2A'
  166. 'GSK3B'
  167. 'H3F3A'
  168. 'H3F3B'
  169. 'H3F3C'
  170. 'HGF'
  171. 'HIST1H1C'
  172. 'HIST1H2BD'
  173. 'HIST1H3A'
  174. 'HIST1H3B'
  175. 'HIST1H3C'
  176. 'HIST1H3D'
  177. 'HIST1H3E'
  178. 'HIST1H3F'
  179. 'HIST1H3G'
  180. 'HIST1H3H'
  181. 'HIST1H3I'
  182. 'HIST1H3J'
  183. 'HIST3H3'
  184. 'HLA-A'
  185. 'HLA-B'
  186. 'HNF1A'
  187. 'HOXB13'
  188. 'HRAS'
  189. 'ICOSLG'
  190. 'ID3'
  191. 'IDH1'
  192. 'IDH2'
  193. 'IFNGR1'
  194. 'IGF1'
  195. 'IGF1R'
  196. 'IGF2'
  197. 'IKBKE'
  198. 'IKZF1'
  199. 'IL10'
  200. 'IL7R'
  201. 'chr18'
  202. 'chr19'
  203. 'chr20'
  204. 'chr21'
  205. 'chr22'
  206. 'chr23'
  207. 'ch_cnv'
  208. 'chr1_del'
  209. 'chr1_loh'
  210. 'chr2_amp'
  211. 'chr2_del'
  212. 'chr2_loh'
  213. 'chr3_amp'
  214. 'chr3_del'
  215. 'chr3_loh'
  216. 'chr4_del'
  217. 'chr4_loh'
  218. 'chr5_del'
  219. 'chr5_loh'
  220. 'chr6_del'
  221. 'chr6_loh'
  222. 'chr7_del'
  223. 'chr7_loh'
  224. 'chr8_amp'
  225. 'chr8_del'
  226. 'chr8_loh'
  227. 'chr9_amp'
  228. 'chr9_del'
  229. 'chr9_loh'
  230. 'chr10_del'
  231. 'chr10_loh'
  232. 'chr11_del'
  233. 'chr11_loh'
  234. 'chr12_amp'
  235. 'chr12_del'
  236. 'chr12_loh'
  237. 'chr13_del'
  238. 'chr13_loh'
  239. 'chr14_amp'
  240. 'chr14_del'
  241. 'chr14_loh'
  242. 'chr15_amp'
  243. 'chr15_del'
  244. 'chr15_loh'
  245. 'chr16_loh'
  246. 'chr17_amp'
  247. 'chr17_del'
  248. 'chr17_loh'
  249. 'chr18_del'
  250. 'chr18_loh'
  251. 'chr19_amp'
  252. 'chr19_del'
  253. 'chr19_loh'
  254. 'chr20_amp'
  255. 'chr20_del'
  256. 'chr20_loh'
  257. 'chr21_del'
  258. 'chr21_loh'
  259. 'chr22_amp'
  260. 'chr22_del'
  261. 'chr22_loh'
  262. 'chr23_del'
  263. 'ch_amp'
  264. 'ch_del'
  265. 'ch_loh'
  266. 'chr1_p_del'
  267. 'chr1_p_loh'
  268. 'chr1_pq_loh'
  269. 'chr1_q_del'
  270. 'chr1_q_loh'
  271. 'chr2_p_amp'
  272. 'chr2_p_del'
  273. 'chr2_p_loh'
  274. 'chr2_q_loh'
  275. 'chr3_p_del'
  276. 'chr3_p_loh'
  277. 'chr3_pq_amp'
  278. 'chr3_q_amp'
  279. 'chr3_q_del'
  280. 'chr4_pq_loh'
  281. 'chr4_q_del'
  282. 'chr4_q_loh'
  283. 'chr5_p_del'
  284. 'chr5_pq_del'
  285. 'chr5_q_del'
  286. 'chr5_q_loh'
  287. 'chr6_p_del'
  288. 'chr6_p_loh'
  289. 'chr6_q_del'
  290. 'chr6_q_loh'
  291. 'chr7_p_del'
  292. 'chr7_q_del'
  293. 'chr7_q_loh'
  294. 'chr8_p_del'
  295. 'chr8_p_loh'
  296. 'chr8_pq_amp'
  297. 'chr8_q_amp'
  298. 'chr9_p_loh'
  299. 'chr9_q_amp'
  300. 'chr9_q_del'
  301. 'chr9_q_loh'
  302. 'chr10_p_del'
  303. 'chr10_q_del'
  304. 'chr10_q_loh'
  305. 'chr11_p_del'
  306. 'chr11_pq_loh'
  307. 'chr11_q_del'
  308. 'chr11_q_loh'
  309. 'chr12_p_del'
  310. 'chr12_p_loh'
  311. 'chr12_pq_amp'
  312. 'chr12_q_del'
  313. 'chr12_q_loh'
  314. 'chr13_q_del'
  315. 'chr13_q_loh'
  316. 'chr14_q_amp'
  317. 'chr14_q_del'
  318. 'chr14_q_loh'
  319. 'chr15_q_amp'
  320. 'chr15_q_del'
  321. 'chr15_q_loh'
  322. 'chr16_pq_loh'
  323. 'chr16_q_loh'
  324. 'chr17_p_del'
  325. 'chr17_p_loh'
  326. 'chr17_pq_loh'
  327. 'chr17_q_amp'
  328. 'chr17_q_del'
  329. 'chr17_q_loh'
  330. 'chr18_p_del'
  331. 'chr18_pq_loh'
  332. 'chr19_p_loh'
  333. 'chr19_pq_amp'
  334. 'chr19_pq_loh'
  335. 'chr19_q_del'
  336. 'chr20_p_amp'
  337. 'chr20_p_del'
  338. 'chr20_pq_del'
  339. 'chr20_q_del'
  340. 'chr20_q_loh'
  341. 'chr21_q_del'
  342. 'chr21_q_loh'
  343. 'chr22_q_amp'
  344. 'chr22_q_del'
  345. 'chr22_q_loh'
  346. 'chr23_p_del'
  347. 'chr23_pq_del'
  348. 'VAF_trans'
  349. 'mutnum_trans'
  350. 'mutnum_pd_trans'
  351. 'VAF_pd_trans'
  352. 'phi_cnv'
  353. 'phi_cnv_auto'
  354. 'n_cnv'
  355. 'n_cnv_chrom'
  356. 'ch_cnv_auto'
  357. 'age_bin'
  358. 'age10'
  359. 'age10_sq'
  360. 'loy'
  361. 'seg'
  362. 'chrom'
  363. 'start_snp'
  364. 'end_snp'
  365. 'cnlr'
  366. 'valor'
  367. 'n_marker'
  368. 'n_het'
  369. 'start'
  370. 'end'
  371. 'size'
  372. 'aberrant'
  373. 'sd_y'
  374. 'y_bar'
  375. 'x_bar'
  376. 'sem_x'
  377. 'sem_y'
  378. 'z_x'
  379. 'p_x'
  380. 'z_y'
  381. 'p_y'
  382. 'chisq'
  383. 'p_chisq'
  384. 'diplogr'
  385. 'cnlr_adj'
  386. 'type'
  387. 'phi'
  388. 'err'
  389. 'loy_logr'
  390. 'loy_dose'
  391. 'loy_bin'
  392. 'anc_n'
  393. 'alc_n'
  394. 'amc_n'
  395. 'hgb_n'
  396. 'mcv_n'
  397. 'rdw_n'
  398. 'plt_n'
  399. 'wbc_n'
  400. 'rbc_n'

Serial CBC

In [ ]:
M_wide %>% filter(ch_cnv == 1) %>% select(dmp_patient_id) # %>% fwrite('./data/cnv_cases.tsv', sep = '\t')
In [67]:
cbc_cnv = fread('./data/cbc_serial_cnv.tsv')
In [104]:
plot_cbc = function(cbc_serial) {
    cbc_serial %>%
    reshape2::melt(measure.vars = cbcs, variable.name = 'bc', value.name = 'count') %>%
    mutate(
        cytopenia = case_when(
            bc == 'plt' ~ count < 100,
            bc == 'hgb' ~ count < 10,
            bc == 'anc' ~ count < 1.8,
            T ~ F
        )
    ) %>%
    ggplot(
        aes(x = DAYSFROMIMPACT, y = count, group = bc, color = cytopenia)
    ) +
    geom_vline(xintercept = 0, linetype = 'dashed', color = 'gray') +
    geom_line() +
    geom_point(size = 1) +
    theme_classic() +
    facet_wrap(~bc, scale = 'free') +
    theme(legend.position = 'none') +
    scale_color_manual(values = c('royalblue', 'red'))
}
In [1433]:
cbcs = c("anc", "alc", "amc", "hgb", "mcv", "rdw", "plt")

CBC ~ CNV

In [339]:
cnvs = colnames(CNV_detailed)[-1][(CNV_detailed[,-1] %>% colSums) >= 5]

cnvs = cnvs[cnvs != 'chr9_loh']
cnvs = c(cnvs, 'chr9_p_loh', 'chr9_q_loh')
cbcs = cbcs[cbcs != 'rbc']

summs = data.frame()

for (bc in paste0(cbcs[!cbcs %in% c('rbc', 'wbc')], '_n')) {
    display(bc)
    for (cnv in cnvs) {
        
        summ = M_wide %>%
            lm(
                glue('{bc} ~ Gender + race_b + age10 + smoke + {cnv}'),
                data = .
            ) %>%
            sjPlot::get_model_data(type = 'est') %>%
            mutate(bc = bc)

        summs = rbind(summs, summ)
    }
}
'anc_n'
'alc_n'
'amc_n'
'hgb_n'
'mcv_n'
'rdw_n'
'plt_n'
In [340]:
options(repr.plot.width = 6, repr.plot.height = 1.8, repr.plot.res = 300)

limit = 5

D = summs %>% 
    filter(term %in% cnvs) %>%
    rowwise() %>%
    mutate(estimate = max(min(estimate, limit), -limit)) %>% 
    ungroup() %>%
    mutate(q = p.adjust(p.value, method = 'BH')) %>%
    mutate(signif_cat = case_when(
        q < 0.01 ~ 'q<0.01',
        q < 0.05 ~ 'q<0.05',
        q < 0.1 ~ 'q<0.1',
        T ~ 'ns'
    )) %>%
    mutate(
        bc = str_remove(bc, '_n'),
        term = str_remove_all(term, '_|chr'),
        term = str_replace(str_replace(str_replace(term, 'del', '-'), 'amp', '+'), 'loh', '='),
        chrom = as.integer(str_extract(term, '\\d+'))
    ) %>%
    arrange(chrom) %>%
    mutate(term = factor(term, unique(term)))

ggplot(
    D,
    aes(x = term, y = bc, fill = estimate)
) +
theme_classic() +
geom_tile(width = 0.9, height = 0.9) +
geom_point(
    data = D %>% filter(q < 0.1),
    aes(pch = signif_cat)
#         aes(size = -log10(p.value))
) +
scale_shape_manual(values = c('q<0.01' = 8, 'q<0.05' = 3, 'q<0.1' = 20), name = 'FDR') +
scale_fill_gradient2(low = 'lightblue', high = 'red', mid = 'white', na.value = 'white', midpoint = 0, limits = c(-limit, limit)) +
theme(plot.margin = unit(c(0,0,0,0), 'mm'), legend.key.size = unit(0.5, 'cm')) +
theme(
    axis.text.x = element_text(angle = 30, hjust = 0.5, size = 8),
    axis.text.y = element_text(size = 8),
    legend.key.size = unit(2.5, 'mm'),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 8),
    legend.spacing = unit(1, 'mm')
) +
xlab('') +
ylab('')

CBC ~ CNV + GM

In [369]:
combs = list(c('JAK2', 'chr9_p_loh'), c('EZH2', 'chr7_loh'),
             c('TET2', 'chr4_del', 'chr4_loh'),
             c('MPL', 'chr1_p_loh'), c('ATM', 'chr11_del', 'chr11_loh'),
            c('DNMT3A', 'chr2_del'),
            c(''))

summs = data.frame()

for (bc in paste0(cbcs[!cbcs %in% c('rbc', 'wbc')], '_n')) {
    display(bc)
    for (comb in combs) {
        
        comb = paste(comb, collapse = '+')
        
        summ = M_wide %>%
            lm(
                glue('{bc} ~ Gender + race_b + age10 + smoke + {comb}'),
                data = .
            ) %>%
            sjPlot::get_model_data(type = 'est') %>%
            mutate(bc = bc)

        summs = rbind(summs, summ)
    }
}
'anc_n'
'alc_n'
'amc_n'
'hgb_n'
'mcv_n'
'rdw_n'
'plt_n'
In [374]:
options(repr.plot.width = 6, repr.plot.height = 2, repr.plot.res = 300)

limit = 5

D = summs %>% 
    filter(!str_detect(term, 'Gender|race|smoke|age')) %>%
    rowwise() %>%
    mutate(estimate = max(min(estimate, limit), -limit)) %>% 
    ungroup() %>%
    mutate(q = p.adjust(p.value, method = 'BH')) %>%
#     mutate(q = p.value) %>%
    mutate(signif_cat = case_when(
        q < 0.01 ~ 'q<0.01',
        q < 0.05 ~ 'q<0.05',
        q < 0.1 ~ 'q<0.1',
        T ~ 'ns'
    )) %>%
    mutate(term = factor(term, unlist(combs))) %>%
    arrange(term) %>%
    mutate(
        bc = str_remove(bc, '_n'),
        term = str_remove_all(term, '_|chr'),
        term = str_replace(str_replace(str_replace(term, 'del', '-'), 'amp', '+'), 'loh', '='),
        chrom = as.integer(str_extract(term, '\\d+'))
    ) %>%
    mutate(term = factor(term, unique(term)))

ggplot(
    D,
    aes(x = term, y = bc, fill = estimate)
) +
theme_classic() +
geom_tile(width = 0.9, height = 0.9) +
geom_point(
    data = D %>% filter(q < 0.1),
    aes(pch = signif_cat)
#         aes(size = -log10(p.value))
) +
scale_shape_manual(values = c('q<0.01' = 8, 'q<0.05' = 3, 'q<0.1' = 20), name = 'FDR') +
scale_fill_gradient2(low = 'lightblue', high = 'red', mid = 'white', na.value = 'white', midpoint = 0, limits = c(-limit, limit)) +
theme(plot.margin = unit(c(0,0,0,0), 'mm'), legend.key.size = unit(0.5, 'cm')) +
theme(
    axis.text.x = element_text(angle = 30, hjust = 1, size = 8),
    axis.text.y = element_text(size = 8),
    legend.key.size = unit(2.5, 'mm'),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 8),
    legend.spacing = unit(1, 'mm')
) +
xlab('') +
ylab('')

Patient Outcome

In [24]:
p = ggplot(
    M_wide %>% 
    filter(!is.na(heme_cat2)) %>% 
    group_by(heme_cat2) %>%
    mutate(n_heme_cat = n()) %>%
    ungroup() %>%
    mutate(heme_cat2 = glue('{heme_cat2}({n_heme_cat})')) %>%
    mutate(seq_to_diag = seq_to_diag/365),
    aes(x = seq_to_diag, fill = heme_cat2)
) +
geom_histogram(bins = 50, color = 'black', size = 0.1) +
scale_y_continuous(expand = c(0,0)) +
xlab('Years since blood draw') +
theme_classic() +
theme(
    legend.title = element_blank(),
    legend.position = 'top'
) +
ylab('Cases')

do_plot(p, 'diagnosis.pdf', w = 5, h = 2.5)
In [208]:
(M_wide %>% filter(post_heme == 1) %>% pull(seq_to_diag) %>% mean)/30
20.7861111111111
In [209]:
M_wide %>% filter(post_heme == 1) %>% pull(seq_to_diag)  %>% range %>% {./30}
  1. 3.26666666666667
  2. 61.8
In [462]:
no_cnv = M_wide %>% filter(post_heme == 1 & heme_cat == 'MDS' & (!ch_cnv_auto)) %>% pull(seq_to_diag)
yes_cnv = M_wide %>% filter(post_heme == 1 & heme_cat == 'MDS' & ch_cnv_auto) %>% pull(seq_to_diag)

t.test(no_cnv, yes_cnv)
	Welch Two Sample t-test

data:  no_cnv and yes_cnv
t = 2.1031, df = 15.963, p-value = 0.05167
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
  -2.609957 639.832179
sample estimates:
mean of x mean of y 
 733.6111  415.0000 
In [25]:
M_wide %>% 
    filter(post_heme == 1) %>% 
    mutate(
        heme_cat = ifelse(heme_cat %in% c('ET', 'PV'), 'MPN', heme_cat),
        heme_cat = ifelse(heme_cat %in% c('ALCL', 'CTCL'), 'T-NHL', heme_cat),
        heme_cat = ifelse(heme_cat %in% c('DLBCL', 'FL', 'MZL', 'PCNSL', 'WM'), 'B-NHL', heme_cat)
    ) %>%
    group_by(heme_cat) %>%
    mutate(n_heme_cat = n()) %>%
    ungroup() %>%
    mutate(seq_to_diag = seq_to_diag/30) %>% 
    group_by(heme_cat) %>%
    summarise(
        n = n(),
        seq_to_diag = mean(seq_to_diag)
    )
`summarise()` ungrouping output (override with `.groups` argument)

A tibble: 9 × 3
heme_catnseq_to_diag
<chr><int><dbl>
AML 1122.92121
B-NHL2720.06296
CLL 1218.35000
CML 420.45833
LCH 1 6.40000
MDS 2621.18590
MM 722.95714
MPN 717.92857
T-NHL 231.28333
In [26]:
M_wide %>% 
    filter(post_heme == 1) %>%
    filter(heme_cat == 'MDS') %>%
    group_by(ch_cnv_auto) %>%
    summarise(round(median(seq_to_diag)/30, 0))
`summarise()` ungrouping output (override with `.groups` argument)

A tibble: 2 × 2
ch_cnv_autoround(median(seq_to_diag)/30, 0)
<dbl><dbl>
023
110
In [30]:
gene_set = fread('./data/gene_set.tsv')$Gene
gene_set = c('DNMT3A', 'TET2', 'JAK2', 'ASXL1', 'TP53', 'ATM', 'PPM1D', 'CHEK2',
             'SRSF2', 'SF3B1', 'U2AF1', 'EZH2', 'MPL', 'NOTCH1', 'TERT', 'SUZ12', 'CBL')

patients_heme = M_wide %>% filter(post_heme == 1) %>% pull(dmp_patient_id)

segs_auto = segs_filtered %>% filter(chrom != '23')

M_co = rbind(
        M_long %>% 
        filter(Gene %in% gene_set) %>%
        filter(ch_nonsilent == 1 | VariantClass == 'Addition') %>%
        mutate(
            mutation = Gene,
            type = VariantClass2,
            arm = 'na',
            focality = 'na'
        ) %>%
        rowwise() %>% mutate(phi = min(VAF_N * 2, 1)) %>% ungroup() %>%
        select(mutation, phi, type, arm, focality, dmp_patient_id),
        segs_auto %>% 
        mutate(type_label = c('loh' = '=', 'del' = '-', 'amp' = '+')[type]) %>%
        mutate(mutation = paste0(chrom, ifelse(arm == 'p,q', '', arm), type_label)) %>%
        select(mutation, phi, type = type2, arm, focality, dmp_patient_id)
    ) %>%
    filter(dmp_patient_id %in% patients_heme) %>%
    full_join(
        M_wide %>% filter(post_heme == 1) %>%
            select(dmp_patient_id, heme_cat, seq_to_diag),
        by = 'dmp_patient_id'
    ) %>%
    filter(heme_cat != 'LCH') %>%
    mutate(
        heme_cat = ifelse(heme_cat %in% c('ET', 'PV'), 'MPN', heme_cat),
        heme_cat = ifelse(heme_cat %in% c('ALCL', 'CTCL'), 'T-NHL', heme_cat),
        heme_cat = ifelse(heme_cat %in% c('DLBCL', 'FL', 'MZL', 'PCNSL', 'WM'), 'B-NHL', heme_cat)
    ) %>%
    mutate(heme_cat = factor(heme_cat, c('MDS', 'MPN', 'AML', 'CML', 'CLL', 'B-NHL', 'T-NHL', 'MM'))) %>%
    mutate(mutation = ifelse(is.na(mutation), 'none', mutation))

M_co = M_co %>% mutate(
    class = case_when(
        mutation %in% c('JAK2', 'MPL', 'CHEK2', '9ploh') ~ 'MPN',
        mutation %in% c('TP53') ~ 'TP53',
        mutation %in% c('EZH2', 'SRSF2', 'U2AF1', 'SF3B1', 'ASXL1', 'SUZ12') ~ 'MDS',
        mutation %in% c('PPM1D', '7qdel') ~ 'CMML',
        mutation %in% c('NOTCH1', '12amp', '19amp', 'ATM', 'MYD88') ~ 'CLL',
        T ~ 'other'
    ),
    class_rank = as.integer(factor(class, c('MPN', 'TP53', 'MDS', 'CMML', 'CLL', 'other')))
)

mutation_order = c('DNMT3A', 'TET2', 'ASXL1', 'JAK2', 'MPL', 'CHEK2',
                   'SRSF2', 'SF3B1', 'U2AF1', 'EZH2', 'SUZ12', 'PPM1D', 'NOTCH1','ATM',
                   'MYD88', 'TERT', 'TP53')

mutation_other = M_co %>% filter(!mutation %in% mutation_order) %>% pull(mutation) %>% unique

mutation_order = c(mutation_order, mutation_other)

D = M_co %>%
#     filter(type == 'snv') %>%
    distinct(mutation, dmp_patient_id, `.keep_all` = T) %>%
    group_by(mutation) %>% mutate(mutation_n = n()) %>% ungroup() %>%
    arrange(class_rank, mutation != 'EZH2', desc(mutation_n)) %>%
    mutate(mutation = factor(mutation, rev(unique(mutation)))) %>% 
    rowwise() %>%
    mutate(mut_weight = 2^which(mutation == levels(mutation))) %>%
    ungroup() %>%
    group_by(dmp_patient_id) %>% 
    mutate(patient_weight = sum(mut_weight)) %>%
    ungroup() %>%
    group_by(dmp_patient_id) %>% 
    mutate(patient_group = min(class_rank)) %>%
    ungroup() %>%
    arrange(heme_cat, seq_to_diag, desc(patient_weight), type) %>%
    mutate(pid = as.integer(factor(dmp_patient_id, unique(dmp_patient_id)))) %>%
    mutate(mutation = as.character(mutation)) %>%
    mutate(mutation = ifelse(mutation == 'none', NA, mutation)) %>%
    mutate(mutation = factor(mutation, rev(mutation_order))) %>%
    mutate(mutation_i = as.integer(mutation))

p_mat = ggplot(
        D,
        aes(x = pid, y = mutation, fill = type)
    ) +
    geom_tile(color = 'white', size = 0.5) +
    geom_vline(
        aes(xintercept = pid - 0.5), color = 'gray95', size = 0.3
    ) +
#     geom_hline(
#         yintercept = length(levels(D$mutation)) - 1 - c(2.5), color = 'gray80',
#     ) +
    theme_classic() +
    theme(
        axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        plot.margin = unit(c(0,0,0,0), 'mm'),
        legend.key.size = unit(2, 'mm')
    ) +
    geom_vline(xintercept = cumsum(
        D %>% distinct(pid, heme_cat) %>% count(heme_cat) %>% pull(n)
    ) + 0.5, size = 0.2, linetype = 'dashed') +
    scale_fill_manual(
        values = c(cnv_pal2, sm_pal),
        na.translate = FALSE
    ) +
    scale_color_manual(
        values = c(cnv_pal2, sm_pal),
        na.translate = FALSE
    ) +
    scale_x_discrete(expand = expansion(add = 0)) +
    scale_y_discrete(expand = c(0.01,0), na.translate = FALSE) +
    xlab('') +
    scale_alpha_continuous(range = c(0.4,1)) +
    labs(alpha = 'Cell fraction') +
    labs(fill = 'Mutation') +
    ylab('')

p_heme = D %>% 
    mutate(seq_to_diag = seq_to_diag/365) %>%
    distinct(dmp_patient_id, pid, heme_cat, seq_to_diag) %>% 
    group_by(heme_cat) %>%
    mutate(label_pos = median(pid)) %>%
    ungroup %>%
    ggplot(
        aes(x = pid, y = 1, fill = heme_cat, alpha = seq_to_diag)
    ) +
    geom_tile(color = 'white', size = 0.5, show.legend = T) +
#     geom_vline(xintercept = cumsum(
#         D %>% distinct(pid, heme_cat) %>% count(heme_cat) %>% pull(n)
#     ) + 0.5, size = 0.2, linetype = 'dashed') +
    geom_rect(xmin = 0, xmax = max(D$pid) + 0.5, ymin = 1.5, ymax = 3, fill = 'white', show.legend = F) +
    geom_hline(yintercept = 0.5, size = 1) +
    geom_text(
        aes(x = label_pos, y = 2, label = heme_cat),
        color = 'black',
        vjust = 1,
        size = 2,
        show.legend = F
    ) +
    theme_void() +
    theme(
        axis.text.x = element_blank(),
        legend.key.size = unit(2, 'mm'),
        plot.margin = unit(c(0,0,0,0), 'mm'),
        legend.margin = margin(l = 1.7,1,1,1, "mm")
    ) +
    scale_x_discrete(expand = expansion(add = 0)) +
    scale_y_discrete(expand = expansion(add = 0)) +
    ylab('') +
    scale_fill_manual(
        values = c(' ' = 'white', 'CLL' = 'royalblue', 'MDS' = 'tomato3', 'CML' = 'yellow',
                   'AML' = 'red', 'MPN' = 'purple', 'B-NHL' = 'yellowgreen', 'T-NHL' = 'orchid', 'MM' = 'orange')
    ) +
    scale_alpha_continuous(trans = 'reverse') +
    labs(alpha = "Years to \ndiagnosis") +
    guides(fill = FALSE)

panel = (p_heme / p_mat) + plot_layout(heights = c(1,20), guides = 'collect')

do_plot(panel, 'heme_heatmap.pdf', w = 7.7, h = 5)
Warning message:
“Removed 51 rows containing missing values (geom_tile).”
Warning message:
“Removed 51 rows containing missing values (geom_tile).”
In [45]:
M_wide %>% filter(heme_cat2 == 'MPN' & chr9) %>% pull(seq_to_diag)
  1. 98
  2. 157
  3. 454
  4. 138

Risk models

In [31]:
betas = data.frame()

for (landmark in seq(90, 360, 90)) {
    
    D = M_wide %>% 
        filter(time_leukemia > landmark) %>%
        mutate(time_leukemia = time_leukemia - landmark) %>%
        rowwise() %>%
        mutate(
#             VAF_pd_10 = min(VAF_pd, 0.5)/0.1,
            VAF_pd_10 = VAF_pd/0.1
        ) %>%
        ungroup()
    
    fit = coxph(Surv(time_leukemia, post_leukemia) ~ ch_cnv_auto + VAF_pd_10 + mutnum_pd + age, data = D, na.action = 'na.omit')
    
    cox.sum = sjPlot::get_model_data(fit, type = 'est') %>% mutate(with_cnv = 0, landmark = landmark)
    
    betas = rbind(
        betas,
        cox.sum
    )
}
Registered S3 methods overwritten by 'lme4':
  method                          from
  cooks.distance.influence.merMod car 
  influence.merMod                car 
  dfbeta.influence.merMod         car 
  dfbetas.influence.merMod        car 

In [32]:
p = betas %>%
    filter(!(term %in% c('GenderMale', 'age'))) %>%
    mutate(landmark = paste0(landmark, ' days')) %>%
    mutate(landmark = factor(landmark, unique(landmark))) %>%
    mutate(term = c('VAF_pd_10' = 'PD VAF', 'mutnum_pd' = 'PD Mutation Number', 'ch_cnv_auto' = 'mCA')[as.character(term)]) %>%
    mutate(term = factor(term, unique(term))) %>%
    plot_forest(
        x = 'term',
        label = 'p.stars',
        eb_w = 0,
        eb_s = 1,
        ps = 2,
        or_s = 3,
        nudge = 0.1,
        col = 'landmark'
    ) +
    theme_classic() +
    ylab('HR') +
    xlab('') +
    scale_color_discrete(name = 'Landmark')

do_plot(p, 'landmark.pdf', w = 4.5, h = 3)
In [33]:
options(repr.plot.width = 4, repr.plot.height = 2, repr.plot.res = 300)

p = betas %>%
    filter(landmark == 270) %>%
    filter(!(term %in% c('GenderMale', 'age'))) %>%
    mutate(landmark = paste0(landmark, ' days')) %>%
    mutate(landmark = factor(landmark, unique(landmark))) %>%
    mutate(term = c('VAF_pd_10' = 'PD VAF', 'mutnum_pd' = 'PD Mutation Number', 'ch_cnv_auto' = 'mCA')[as.character(term)]) %>%
    mutate(term = factor(term, unique(term))) %>%
    plot_forest(
        x = 'term',
        label = 'p.stars',
        eb_w = 0,
        eb_s = 1,
        ps = 2,
        or_s = 3,
        nudge = 0.1
    ) +
    theme_classic() +
    ylab('HR') +
    xlab('')

do_plot(p, 'cox.pdf', w = 4, h = 1.8)
In [47]:
betas %>%
filter(landmark == 270)
A data.frame: 4 × 16
termestimatestd.errorconf.lowconf.highstatisticdf.errorp.valuep.starsp.labelgroupxposxminxmaxwith_cnvlandmark
<fct><dbl><dbl><dbl><dbl><dbl><int><dbl><chr><chr><chr><dbl><dbl><dbl><dbl><dbl>
ch_cnv_auto14.06701050.434849955.998696232.987299 6.0798728224241.202779e-09***14.07 ***pos43.8254.1750270
VAF_pd_10 1.55717070.143066591.1764073 2.061174 3.0955552224241.964448e-03** 1.56 ** pos32.8253.1750270
mutnum_pd 2.06711480.114703501.6509306 2.588215 6.3307035224242.440459e-10***2.07 *** pos21.8252.1750270
age 0.99479350.012439320.9708331 1.019345-0.4196436224246.747458e-01 0.99 neg10.8251.1750270

Cumulative incidence

In [35]:
# data frame for Sean
M_wide %>% 
select(dmp_patient_id, ch_cnv_auto, ch_nonsilent, DateofProcedure, date_death,
       date_lastfu, post_leukemia, post_mn, seq_to_diag) %>%
#fwrite('./data/survival_sean.tsv', sep = '\t')
In [3]:
library(prodlim)
library(cmprsk)

data001 = M_wide %>% mutate(seq_to_diag = ifelse(post_leukemia, seq_to_diag, NA))

OSDays <- as.numeric(as.Date(data001$date_lastfu , "%Y-%m-%d") - as.Date(data001$DateofProcedure , "%Y-%m-%d"))
Dead <- 1*(!is.na(as.Date(data001$date_death , "%Y-%m-%d")))

TimetoAnalysis <- data001$seq_to_diag
TimetoAnalysis[is.na(TimetoAnalysis)] <- OSDays[is.na(TimetoAnalysis)]

YearstoAnalysis <- TimetoAnalysis/365
data001$YearstoAnalysis <- YearstoAnalysis

EventAnalysis <- 2*Dead
EventAnalysis[!is.na(data001$seq_to_diag)] <- 1
data001$EventAnalysis <- EventAnalysis

CH <- rep("None", length(data001$ch_nonsilent))
CH[data001$ch_pd ==0 & data001$ch_cnv_auto==1] <- "mCA ONLY"
CH[data001$ch_pd ==1 & data001$ch_cnv_auto==0] <- "GM ONLY"
CH[data001$ch_pd ==1 & data001$ch_cnv_auto==1] <- "mCA+GM"

data001$CH <- CH

data001 = data001 %>% mutate(CH = factor(CH, c('None', 'GM ONLY', 'mCA ONLY', 'mCA+GM')))

data002 <- data001[data001$YearstoAnalysis >0 , ] ## need to remove the <= 0 times
dim(data002)

cmv <- prodlim(Hist(YearstoAnalysis, EventAnalysis) ~CH, data=data002)
summary(cmv,3) ### if the lower confidence interval =0 in the output just put at <0.001
  1. 31421
  2. 1830

----------> Cause:  1 

CH=None :
  time n.risk n.event n.lost  cuminc se.cuminc   lower   upper
1    3   4686       0      8 0.00169  0.000338 0.00102 0.00235

CH=GM ONLY :
  time n.risk n.event n.lost  cuminc se.cuminc   lower   upper
1    3    815       0      3 0.00346   0.00109 0.00133 0.00559

CH=mCA ONLY :
  time n.risk n.event n.lost  cuminc se.cuminc lower  upper
1    3     22       0      0 0.00647   0.00645     0 0.0191

CH=mCA+GM :
  time n.risk n.event n.lost cuminc se.cuminc  lower upper
1    3     12       0      0  0.146    0.0397 0.0686 0.224



----------> Cause:  2 

CH=None :
  time n.risk n.event n.lost cuminc se.cuminc lower upper
1    3   4686       3      8  0.463   0.00402 0.455 0.471

CH=GM ONLY :
  time n.risk n.event n.lost cuminc se.cuminc lower upper
1    3    815       0      3  0.545   0.00875 0.528 0.563

CH=mCA ONLY :
  time n.risk n.event n.lost cuminc se.cuminc lower upper
1    3     22       0      0    0.5    0.0488 0.404 0.595

CH=mCA+GM :
  time n.risk n.event n.lost cuminc se.cuminc lower upper
1    3     12       0      0  0.521    0.0591 0.405 0.637
In [4]:
par(mar = c(3,2,1,1))

pdf(file = './figures/cum_inc.pdf', width = 6, height = 6)

plot(cmv, 
     xlim=c(0, 4), ylim=c(0,0.25),
     xlab = "Years from blooddraw", ylab = "Cumulative Incidence",
     atRisk.at = seq(0, 4, 1),
     axis1.at = seq(0, 4, 1),
     atrisk.labels = "",  
     legend.cex = 1,
     col = c('#abdda4', '#2b83ba','#fdae61', '#d7191c'),
     legend.x="topleft") 
title(main="", cex=3)

dev.off()

display_pdf(file = './figures/cum_inc.pdf')
png: 2
In [38]:
## P-value
print(cuminc(YearstoAnalysis, EventAnalysis, CH))
## P-value is <0.001 [corresponding test statistics is 174.7842, under 3 d.f.s
3 cases omitted due to missing values
Tests:
      stat pv df
1 867.4974  0  3
2 127.7302  0  3
Estimates and Variances:
$est
                      1           2           3           4           5
None 1     0.0003060585 0.001065549 0.001686634 0.002109205 0.002474564
SM ONLY 1  0.0008483407 0.001463356 0.003462066 0.004811257 0.004811257
mCA ONLY 1 0.0064342878 0.006434288 0.006434288 0.006434288 0.006434288
mCA+SM 1   0.0719910198 0.125408042 0.146466522 0.174168451          NA
None 2     0.2048790046 0.363604942 0.462831935 0.534147827 0.594140730
SM ONLY 2  0.2690778071 0.446743001 0.545409507 0.610260933 0.668383542
mCA ONLY 2 0.2730613540 0.422159386 0.502780017 0.555364199 0.664914577
mCA+SM 2   0.1914065690 0.434278287 0.521110331 0.554968244          NA
                    6
None 1     0.00325831
SM ONLY 1          NA
mCA ONLY 1         NA
mCA+SM 1           NA
None 2     0.63674892
SM ONLY 2          NA
mCA ONLY 2         NA
mCA+SM 2           NA

$var
                      1            2            3            4            5
None 1     1.341033e-08 5.810790e-08 1.140088e-07 1.735935e-07 3.069397e-07
SM ONLY 1  1.802115e-07 3.691778e-07 1.182146e-06 2.087657e-06 2.087657e-06
mCA ONLY 1 4.143285e-05 4.143285e-05 4.143285e-05 4.143285e-05 4.143285e-05
mCA+SM 1   6.125324e-04 1.241108e-03 1.626321e-03 2.288499e-03           NA
None 2     7.175948e-06 1.222230e-05 1.619844e-05 2.202069e-05 3.634250e-05
SM ONLY 2  4.080523e-05 6.125538e-05 7.654189e-05 9.771437e-05 1.701055e-04
mCA ONLY 2 1.314698e-03 1.926157e-03 2.401558e-03 3.252422e-03 1.384196e-02
mCA+SM 2   1.373194e-03 2.728885e-03 3.625404e-03 4.252127e-03           NA
                      6
None 1     9.206002e-07
SM ONLY 1            NA
mCA ONLY 1           NA
mCA+SM 1             NA
None 2     9.705606e-05
SM ONLY 2            NA
mCA ONLY 2           NA
mCA+SM 2             NA

Tally

In [1471]:
segs_filtered %>%
filter(post_anyblood == 1) %>%
mutate(heme_cat = ifelse(heme_cat %in% c('ET', 'PV'), 'MPN', heme_cat)) %>%
reshape2::dcast(heme_cat ~ chrom + type, fun.aggregate = function(x){length(unique(x))}, value.var = 'dmp_patient_id')
A data.frame: 5 × 9
heme_cat1_del1_loh2_del5_del7_loh12_amp13_loh14_amp
<chr><int><int><int><int><int><int><int><int>
CLL10000110
FL 00010000
MDS00101000
MPN01000000
MZL00000001
In [39]:
segs_filtered %>%
filter(post_anyblood == 1) %>%
mutate(heme_cat = ifelse(heme_cat %in% c('ET', 'PV'), 'MPN', heme_cat)) %>%
reshape2::dcast(heme_cat ~ ., fun.aggregate = function(x){length(unique(x))}, value.var = 'dmp_patient_id')
A data.frame: 6 × 2
heme_cat.
<chr><int>
CLL 2
DLBCL1
FL 1
MDS 8
MPN 5
MZL 1
In [1484]:
M_wide %>%
mutate(heme_cat = ifelse(heme_cat %in% c('ET', 'PV'), 'MPN', heme_cat)) %>%
filter(post_anyblood == 1) %>% 
group_by(heme_cat) %>%
summarise(
    total = n(),
    n_ai = sum(ch_cnv_auto),
    n_sm = sum(ch_nonsilent),
    n_ai_only = sum(ch_cnv_auto & (!ch_nonsilent))
)
A tibble: 11 × 5
heme_cattotaln_ain_smn_ai_only
<chr><int><int><int><int>
AML 9020
CLL 6260
CML 3010
CTCL 1010
DLBCL 2010
FL 5120
MDS 17260
MM 3020
MPN 3130
MZL 6150
PCNSL 2010
In [40]:
M_wide %>%
filter(!is.na(heme_cat2)) %>% 
summarise(
    total = n(),
    n_ai = sum(ch_cnv_auto),
    prop_ai = n_ai/total,
    n_sm = sum(ch_nonsilent),
    n_sm_only = sum(ch_nonsilent & (!ch_cnv_auto)),
    prop_sm_only = n_sm_only/total,
    n_ai_only = sum(ch_cnv_auto & (!ch_nonsilent)),
    prop_ai_only = n_ai_only/total
)

M_wide %>%
filter(!is.na(heme_cat2)) %>% 
group_by(heme_cat2) %>%
summarise(
    total = n(),
    n_ai = sum(ch_cnv_auto),
    n_sm = sum(ch_nonsilent),
    n_sm_only = sum(ch_nonsilent & (!ch_cnv_auto)),
    n_ai_only = sum(ch_cnv_auto & (!ch_nonsilent))
)
A tibble: 1 × 8
totaln_aiprop_ain_smn_sm_onlyprop_sm_onlyn_ai_onlyprop_ai_only
<int><dbl><dbl><int><int><dbl><int><dbl>
96180.187557400.416666710.01041667
`summarise()` ungrouping output (override with `.groups` argument)

A tibble: 8 × 6
heme_cat2totaln_ain_smn_sm_onlyn_ai_only
<fct><int><dbl><int><int><int>
MDS 26814 71
MPN 75 7 20
AML 110 3 30
CML 40 2 20
CLL 12210 80
B-NHL27315120
T-NHL 20 1 10
MM 70 5 50
In [441]:
M_wide %>% count(ch_cnv)
A tibble: 2 × 2
ch_cnvn
<dbl><int>
032095
1 349
In [450]:
M_wide %>% filter((JAK2) & chr9_p_loh) %>% filter(post_heme == 1) %>% pull(VAF_JAK2)
  1. 0.59385
  2. 0.29412
  3. 0.64407

Overall Survival

In [152]:
colnames(M_wide)
  1. 'dmp_patient_id'
  2. 'ABL1'
  3. 'ACVR1'
  4. 'AKT1'
  5. 'AKT2'
  6. 'AKT3'
  7. 'ALK'
  8. 'ALOX12B'
  9. 'AMER1'
  10. 'ANKRD11'
  11. 'APC'
  12. 'AR'
  13. 'ARAF'
  14. 'ARID1A'
  15. 'ARID1B'
  16. 'ARID2'
  17. 'ARID5B'
  18. 'ASXL1'
  19. 'ASXL2'
  20. 'ATM'
  21. 'ATR'
  22. 'ATRX'
  23. 'AURKA'
  24. 'AURKB'
  25. 'AXIN1'
  26. 'AXIN2'
  27. 'AXL'
  28. 'B2M'
  29. 'BABAM1'
  30. 'BAP1'
  31. 'BARD1'
  32. 'BBC3'
  33. 'BCL10'
  34. 'BCL2'
  35. 'BCL2L11'
  36. 'BCL6'
  37. 'BCOR'
  38. 'BIRC3'
  39. 'BLM'
  40. 'BMPR1A'
  41. 'BRAF'
  42. 'BRCA1'
  43. 'BRCA2'
  44. 'BRD4'
  45. 'BRIP1'
  46. 'BTK'
  47. 'CALR'
  48. 'CARD11'
  49. 'CARM1'
  50. 'CASP8'
  51. 'CBFB'
  52. 'CBL'
  53. 'CCND1'
  54. 'CCND2'
  55. 'CCND3'
  56. 'CCNE1'
  57. 'CD274'
  58. 'CD276'
  59. 'CD79A'
  60. 'CD79B'
  61. 'CDC42'
  62. 'CDC73'
  63. 'CDH1'
  64. 'CDK12'
  65. 'CDK4'
  66. 'CDK6'
  67. 'CDK8'
  68. 'CDKN1A'
  69. 'CDKN1B'
  70. 'CDKN2B'
  71. 'CDKN2C'
  72. 'CEBPA'
  73. 'CENPA'
  74. 'CHEK1'
  75. 'CHEK2'
  76. 'CIC'
  77. 'CREBBP'
  78. 'CRKL'
  79. 'CRLF2'
  80. 'CSDE1'
  81. 'CSF1R'
  82. 'CSF3R'
  83. 'CTCF'
  84. 'CTLA4'
  85. 'CTNNB1'
  86. 'CUL3'
  87. 'CXCR4'
  88. 'CYLD'
  89. 'CYSLTR2'
  90. 'DAXX'
  91. 'DCUN1D1'
  92. 'DDR2'
  93. 'DICER1'
  94. 'DIS3'
  95. 'DNAJB1'
  96. 'DNMT1'
  97. 'DNMT3A'
  98. 'DNMT3B'
  99. 'DOT1L'
  100. 'DROSHA'
  101. 'DUSP4'
  102. 'E2F3'
  103. 'EED'
  104. 'EGFL7'
  105. 'EGFR'
  106. 'EIF1AX'
  107. 'EIF4A2'
  108. 'ELF3'
  109. 'EP300'
  110. 'EPAS1'
  111. 'EPCAM'
  112. 'EPHA3'
  113. 'EPHA5'
  114. 'EPHA7'
  115. 'EPHB1'
  116. 'ERBB2'
  117. 'ERBB3'
  118. 'ERBB4'
  119. 'ERCC2'
  120. 'ERCC3'
  121. 'ERCC4'
  122. 'ERCC5'
  123. 'ERF'
  124. 'ERG'
  125. 'ERRFI1'
  126. 'ESR1'
  127. 'ETV1'
  128. 'ETV6'
  129. 'EZH1'
  130. 'EZH2'
  131. 'FAM175A'
  132. 'FAM46C'
  133. 'FAM58A'
  134. 'FANCA'
  135. 'FANCC'
  136. 'FAT1'
  137. 'FBXW7'
  138. 'FGF19'
  139. 'FGF3'
  140. 'FGF4'
  141. 'FGFR1'
  142. 'FGFR2'
  143. 'FGFR3'
  144. 'FGFR4'
  145. 'FH'
  146. 'FLCN'
  147. 'FLT1'
  148. 'FLT3'
  149. 'FLT4'
  150. 'FOXA1'
  151. 'FOXL2'
  152. 'FOXO1'
  153. 'FOXP1'
  154. 'FUBP1'
  155. 'FYN'
  156. 'GATA1'
  157. 'GATA2'
  158. 'GATA3'
  159. 'GLI1'
  160. 'GNA11'
  161. 'GNAQ'
  162. 'GNAS'
  163. 'GPS2'
  164. 'GREM1'
  165. 'GRIN2A'
  166. 'GSK3B'
  167. 'H3F3A'
  168. 'H3F3B'
  169. 'H3F3C'
  170. 'HGF'
  171. 'HIST1H1C'
  172. 'HIST1H2BD'
  173. 'HIST1H3A'
  174. 'HIST1H3B'
  175. 'HIST1H3C'
  176. 'HIST1H3D'
  177. 'HIST1H3E'
  178. 'HIST1H3F'
  179. 'HIST1H3G'
  180. 'HIST1H3H'
  181. 'HIST1H3I'
  182. 'HIST1H3J'
  183. 'HIST3H3'
  184. 'HLA-A'
  185. 'HLA-B'
  186. 'HNF1A'
  187. 'HOXB13'
  188. 'HRAS'
  189. 'ICOSLG'
  190. 'ID3'
  191. 'IDH1'
  192. 'IDH2'
  193. 'IFNGR1'
  194. 'IGF1'
  195. 'IGF1R'
  196. 'IGF2'
  197. 'IKBKE'
  198. 'IKZF1'
  199. 'IL10'
  200. 'IL7R'
  201. 'chr18'
  202. 'chr19'
  203. 'chr20'
  204. 'chr21'
  205. 'chr22'
  206. 'chr23'
  207. 'ch_cnv'
  208. 'chr1_del'
  209. 'chr1_loh'
  210. 'chr2_amp'
  211. 'chr2_del'
  212. 'chr2_loh'
  213. 'chr3_amp'
  214. 'chr3_del'
  215. 'chr3_loh'
  216. 'chr4_del'
  217. 'chr4_loh'
  218. 'chr5_del'
  219. 'chr5_loh'
  220. 'chr6_del'
  221. 'chr6_loh'
  222. 'chr7_del'
  223. 'chr7_loh'
  224. 'chr8_amp'
  225. 'chr8_del'
  226. 'chr8_loh'
  227. 'chr9_amp'
  228. 'chr9_del'
  229. 'chr9_loh'
  230. 'chr10_del'
  231. 'chr10_loh'
  232. 'chr11_del'
  233. 'chr11_loh'
  234. 'chr12_amp'
  235. 'chr12_del'
  236. 'chr12_loh'
  237. 'chr13_del'
  238. 'chr13_loh'
  239. 'chr14_amp'
  240. 'chr14_del'
  241. 'chr14_loh'
  242. 'chr15_amp'
  243. 'chr15_del'
  244. 'chr15_loh'
  245. 'chr16_loh'
  246. 'chr17_amp'
  247. 'chr17_del'
  248. 'chr17_loh'
  249. 'chr18_del'
  250. 'chr18_loh'
  251. 'chr19_amp'
  252. 'chr19_del'
  253. 'chr19_loh'
  254. 'chr20_amp'
  255. 'chr20_del'
  256. 'chr20_loh'
  257. 'chr21_del'
  258. 'chr21_loh'
  259. 'chr22_amp'
  260. 'chr22_del'
  261. 'chr22_loh'
  262. 'chr23_del'
  263. 'ch_amp'
  264. 'ch_del'
  265. 'ch_loh'
  266. 'chr1_p_del'
  267. 'chr1_p_loh'
  268. 'chr1_pq_loh'
  269. 'chr1_q_del'
  270. 'chr1_q_loh'
  271. 'chr2_p_amp'
  272. 'chr2_p_del'
  273. 'chr2_p_loh'
  274. 'chr2_q_loh'
  275. 'chr3_p_del'
  276. 'chr3_p_loh'
  277. 'chr3_pq_amp'
  278. 'chr3_q_amp'
  279. 'chr3_q_del'
  280. 'chr4_pq_loh'
  281. 'chr4_q_del'
  282. 'chr4_q_loh'
  283. 'chr5_p_del'
  284. 'chr5_pq_del'
  285. 'chr5_q_del'
  286. 'chr5_q_loh'
  287. 'chr6_p_del'
  288. 'chr6_p_loh'
  289. 'chr6_q_del'
  290. 'chr6_q_loh'
  291. 'chr7_p_del'
  292. 'chr7_q_del'
  293. 'chr7_q_loh'
  294. 'chr8_p_del'
  295. 'chr8_p_loh'
  296. 'chr8_pq_amp'
  297. 'chr8_q_amp'
  298. 'chr9_p_loh'
  299. 'chr9_q_amp'
  300. 'chr9_q_del'
  301. 'chr9_q_loh'
  302. 'chr10_p_del'
  303. 'chr10_q_del'
  304. 'chr10_q_loh'
  305. 'chr11_p_del'
  306. 'chr11_pq_loh'
  307. 'chr11_q_del'
  308. 'chr11_q_loh'
  309. 'chr12_p_del'
  310. 'chr12_p_loh'
  311. 'chr12_pq_amp'
  312. 'chr12_q_del'
  313. 'chr12_q_loh'
  314. 'chr13_q_del'
  315. 'chr13_q_loh'
  316. 'chr14_q_amp'
  317. 'chr14_q_del'
  318. 'chr14_q_loh'
  319. 'chr15_q_amp'
  320. 'chr15_q_del'
  321. 'chr15_q_loh'
  322. 'chr16_pq_loh'
  323. 'chr16_q_loh'
  324. 'chr17_p_del'
  325. 'chr17_p_loh'
  326. 'chr17_pq_loh'
  327. 'chr17_q_amp'
  328. 'chr17_q_del'
  329. 'chr17_q_loh'
  330. 'chr18_p_del'
  331. 'chr18_pq_loh'
  332. 'chr19_p_loh'
  333. 'chr19_pq_amp'
  334. 'chr19_pq_loh'
  335. 'chr19_q_del'
  336. 'chr20_p_amp'
  337. 'chr20_p_del'
  338. 'chr20_pq_del'
  339. 'chr20_q_del'
  340. 'chr20_q_loh'
  341. 'chr21_q_del'
  342. 'chr21_q_loh'
  343. 'chr22_q_amp'
  344. 'chr22_q_del'
  345. 'chr22_q_loh'
  346. 'chr23_p_del'
  347. 'chr23_pq_del'
  348. 'VAF_trans'
  349. 'mutnum_trans'
  350. 'mutnum_pd_trans'
  351. 'VAF_pd_trans'
  352. 'phi_cnv'
  353. 'phi_cnv_auto'
  354. 'n_cnv'
  355. 'n_cnv_chrom'
  356. 'ch_cnv_auto'
  357. 'age_bin'
  358. 'age10'
  359. 'age10_sq'
  360. 'loy'
  361. 'seg'
  362. 'chrom'
  363. 'start_snp'
  364. 'end_snp'
  365. 'cnlr'
  366. 'valor'
  367. 'n_marker'
  368. 'n_het'
  369. 'start'
  370. 'end'
  371. 'size'
  372. 'aberrant'
  373. 'sd_y'
  374. 'y_bar'
  375. 'x_bar'
  376. 'sem_x'
  377. 'sem_y'
  378. 'z_x'
  379. 'p_x'
  380. 'z_y'
  381. 'p_y'
  382. 'chisq'
  383. 'p_chisq'
  384. 'diplogr'
  385. 'cnlr_adj'
  386. 'type'
  387. 'phi'
  388. 'err'
  389. 'loy_logr'
  390. 'loy_dose'
  391. 'loy_bin'
  392. 'anc_n'
  393. 'alc_n'
  394. 'amc_n'
  395. 'hgb_n'
  396. 'mcv_n'
  397. 'rdw_n'
  398. 'plt_n'
  399. 'wbc_n'
  400. 'rbc_n'
In [151]:
options(repr.plot.width = 8, repr.plot.height = 5, repr.plot.res = 300)

fit = M_wide %>%
    survfit(formula = Surv(time_lastfu, death_status) ~ ch_cnv_auto, data = .)

size_coef = 0.3

p = ggsurvplot(
    fit,
    # Defining plotting parameters
    data = M_wide,
    size = 3 * size_coef,              # Change line size
    fontsize = 10 * size_coef,           # Size of the table's labels
    palette = 'Dark2',
    ggtheme = theme_classic(),         # Change ggplot2 theme
    tables.theme = clean_theme(),    # Change table's theme
    # Adding useful info to the plot
#     conf.int = TRUE,         # Add confidence interval
    pval = TRUE,               # Add p-value
    pval.size = 12 * size_coef,
    risk.table = TRUE,         # Add risk table
    # Adding labels and title
    #  title = paste0(g," survival curves"),
#     legend.labs = c("Normal", "CH-SM", "CH-CNV"),    # Change legend labels
    legend.title = "",
    legend = "right",
    xlab = "Time in days",    # Customize X axis label
    ylab = "tMN free proportion",    # Customize Y axis label
    # Defining risk table plotting parameters
    risk.table.col = "strata",        # Risk table color by groups
    risk.table.height = 0.25,   # Useful to change when you have multiple groups
    risk.table.y.text = FALSE  # Displaying colors instead of name in risk table
)

do_plot(p, "R1_overall_survival.pdf", w = 5, h = 5)
Error in Surv(time_lastfu, death_status): object 'death_status' not found
Traceback:

1. M_wide %>% survfit(formula = Surv(time_lastfu, death_status) ~ 
 .     ch_cnv_auto, data = .)
2. withVisible(eval(quote(`_fseq`(`_lhs`)), env, env))
3. eval(quote(`_fseq`(`_lhs`)), env, env)
4. eval(quote(`_fseq`(`_lhs`)), env, env)
5. `_fseq`(`_lhs`)
6. freduce(value, `_function_list`)
7. withVisible(function_list[[k]](value))
8. function_list[[k]](value)
9. survfit(formula = Surv(time_lastfu, death_status) ~ ch_cnv_auto, 
 .     data = .)
10. survfit.formula(formula = Surv(time_lastfu, death_status) ~ ch_cnv_auto, 
  .     data = .)
11. eval.parent(temp)
12. eval(expr, p)
13. eval(expr, p)
14. stats::model.frame(formula = Surv(time_lastfu, death_status) ~ 
  .     ch_cnv_auto, data = .)
15. model.frame.default(formula = Surv(time_lastfu, death_status) ~ 
  .     ch_cnv_auto, data = .)
16. eval(predvars, data, env)
17. eval(predvars, data, env)
18. Surv(time_lastfu, death_status)
In [ ]: